diff --git a/libx/jmimemagic-0.0.4a.jar b/libx/jmimemagic-0.0.4a.jar deleted file mode 100644 index 9fb2355d3..000000000 Binary files a/libx/jmimemagic-0.0.4a.jar and /dev/null differ diff --git a/libx/jmimemagic-0.1.0.jar b/libx/jmimemagic-0.1.0.jar new file mode 100644 index 000000000..6e83304fd Binary files /dev/null and b/libx/jmimemagic-0.1.0.jar differ diff --git a/libx/jmimemagic-0.0.4a.license b/libx/jmimemagic-0.1.0.license similarity index 99% rename from libx/jmimemagic-0.0.4a.license rename to libx/jmimemagic-0.1.0.license index b1e3f5a26..f3f1b3b65 100644 --- a/libx/jmimemagic-0.0.4a.license +++ b/libx/jmimemagic-0.1.0.license @@ -55,7 +55,7 @@ modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. - + Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a @@ -111,7 +111,7 @@ modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. - + GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION @@ -158,7 +158,7 @@ Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. - + 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 @@ -216,7 +216,7 @@ instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. - + Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. @@ -267,7 +267,7 @@ Library will still fall under Section 6.) distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. - + 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work @@ -329,7 +329,7 @@ restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. - + 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined @@ -370,7 +370,7 @@ subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. - + 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or @@ -422,7 +422,7 @@ conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. - + 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is @@ -456,7 +456,7 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS - + How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest diff --git a/source/de/anomic/plasma/parser/mimeType/build.xml b/source/de/anomic/plasma/parser/mimeType/build.xml index 20c20f6a9..2dcbe50d8 100644 --- a/source/de/anomic/plasma/parser/mimeType/build.xml +++ b/source/de/anomic/plasma/parser/mimeType/build.xml @@ -16,9 +16,10 @@ - + + @@ -28,7 +29,7 @@ - + diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 6d5eabc33..df21396f7 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -55,6 +55,7 @@ import org.apache.log4j.Logger; import net.sf.jmimemagic.Magic; import net.sf.jmimemagic.MagicMatch; +import net.sf.jmimemagic.MagicMatchNotFoundException; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; @@ -86,7 +87,7 @@ implements Parser { * @see Parser#getLibxDependences() */ private static final String[] LIBX_DEPENDENCIES = new String[] { - "jmimemagic-0.0.4a.jar", + "jmimemagic-0.1.0.jar", "jakarta-oro-2.0.7.jar", "log4j-1.2.9.jar", "xerces.jar" @@ -106,9 +107,8 @@ implements Parser { public String getMimeType (File sourceFile) { String mimeType = null; - try { - Magic theMagic = new Magic(); - MagicMatch match = theMagic.getMagicMatch(sourceFile); + try { + MagicMatch match = Magic.getMagicMatch(sourceFile,true); // if a match was found we can return the new mimeType if (match!=null) { @@ -145,10 +145,8 @@ implements Parser { // deactivating the logging for jMimeMagic Logger jmimeMagicLogger = Logger.getLogger("net.sf.jmimemagic"); jmimeMagicLogger.setLevel(Level.OFF); - - Magic theMagic = new Magic(); - MagicMatch match = theMagic.getMagicMatch(sourceFile); - + + MagicMatch match = Magic.getMagicMatch(sourceFile,true,false); // if a match was found we can return the new mimeType if (match!=null) { @@ -172,7 +170,8 @@ implements Parser { return theParser.parseSource(location,mimeType,charset,sourceFile); } throw new ParserException("Unable to detect mimetype of resource.",location); - + } catch (MagicMatchNotFoundException e) { + throw new ParserException("Unable to detect mimetype of resource.",location); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; diff --git a/source/de/anomic/plasma/parser/mimeType/odtDetector.java b/source/de/anomic/plasma/parser/mimeType/odtDetector.java new file mode 100644 index 000000000..1b709ce38 --- /dev/null +++ b/source/de/anomic/plasma/parser/mimeType/odtDetector.java @@ -0,0 +1,112 @@ +//odtDetector.java +//------------------------ +//part of YaCy +//(C) by Michael Peter Christen; mc@anomic.de +//first published on http://www.anomic.de +//Frankfurt, Germany, 2005 +// +//this file is contributed by Martin Thelian +//last major change: 16.05.2005 +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. +// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. +// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//Using this software in any meaning (reading, learning, copying, compiling, +//running) means that you agree that the Author(s) is (are) not responsible +//for cost, loss of data or any harm that may be caused directly or indirectly +//by usage of this softare or this documentation. The usage of this software +//is on your own risk. The installation and usage (starting/running) of this +//software may allow other people or application to access your computer and +//any attached devices and is highly dependent on the configuration of the +//software which must be done by the user of the software; the author(s) is +//(are) also not responsible for proper configuration and usage of the +//software, even if provoked by documentation provided together with +//the software. +// +//Any changes to this file according to the GPL as documented in the file +//gpl.txt aside this file in the shipment you received can be done to the +//lines that follows this copyright notice here, but changes must not be +//done inside the copyright notive above. A re-distribution must contain +//the intact and unchanged copyright notice. +//Contributions and changes to the program code must be marked as such. + + +package de.anomic.plasma.parser.mimeType; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +import net.sf.jmimemagic.MagicDetector; +import de.anomic.server.serverFileUtils; + +public class odtDetector implements MagicDetector { + + public String getDisplayName() { + return "ODT MimeType Detector"; + } + + public String[] getHandledExtensions() { + return new String[]{"zip","odt"}; + } + + public String[] getHandledTypes() { + return new String[] { "application/vnd.oasis.opendocument.text", "application/x-vnd.oasis.opendocument.text" }; + } + + public String getName() { + return "odtfiledetector"; + } + + public String getVersion() { + return "0.1"; + } + + public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType, Map params) { + File dstFile = null; + try { + dstFile = File.createTempFile("mimeTypeParser",".tmp"); + serverFileUtils.write(data,dstFile); + return process(dstFile, offset, length, bitmask, comparator, mimeType, params); + } catch (IOException e) { + return null; + } finally { + if (dstFile != null) {dstFile.delete();} + } + } + + public String[] process(File file, int offset, int length, long bitmask, char comparator, String mimeType, Map params) { + try { + // opening the zip file + ZipFile zipFile = new ZipFile(file); + + // searching for a file named mimetype + ZipEntry mimeTypeInfo = zipFile.getEntry("mimetype"); + if (mimeTypeInfo == null) return null; + + // read in the content of the file + InputStream zippedContent = zipFile.getInputStream(mimeTypeInfo); + String realMimeType = new String(serverFileUtils.read(zippedContent, mimeTypeInfo.getSize())); + + return new String[]{realMimeType}; + } catch (Exception e) { + return null; + } + + } + +} diff --git a/source/de/anomic/plasma/parser/mimeType/rssDetector.java b/source/de/anomic/plasma/parser/mimeType/rssDetector.java new file mode 100644 index 000000000..d952e92e7 --- /dev/null +++ b/source/de/anomic/plasma/parser/mimeType/rssDetector.java @@ -0,0 +1,118 @@ +//rssDetector.java +//------------------------ +//part of YaCy +//(C) by Michael Peter Christen; mc@anomic.de +//first published on http://www.anomic.de +//Frankfurt, Germany, 2005 +// +//this file is contributed by Martin Thelian +//last major change: 16.05.2005 +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. +// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. +// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//Using this software in any meaning (reading, learning, copying, compiling, +//running) means that you agree that the Author(s) is (are) not responsible +//for cost, loss of data or any harm that may be caused directly or indirectly +//by usage of this softare or this documentation. The usage of this software +//is on your own risk. The installation and usage (starting/running) of this +//software may allow other people or application to access your computer and +//any attached devices and is highly dependent on the configuration of the +//software which must be done by the user of the software; the author(s) is +//(are) also not responsible for proper configuration and usage of the +//software, even if provoked by documentation provided together with +//the software. +// +//Any changes to this file according to the GPL as documented in the file +//gpl.txt aside this file in the shipment you received can be done to the +//lines that follows this copyright notice here, but changes must not be +//done inside the copyright notive above. A re-distribution must contain +//the intact and unchanged copyright notice. +//Contributions and changes to the program code must be marked as such. + + +package de.anomic.plasma.parser.mimeType; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.lang.reflect.Method; +import java.util.Map; + +import net.sf.jmimemagic.MagicDetector; + +public class rssDetector implements MagicDetector { + + public String getDisplayName() { + return "RSS MimeType Detector"; + } + + public String[] getHandledExtensions() { + return new String[]{"xml","rss","rdf","atom"}; + } + + public String[] getHandledTypes() { + return new String[] { "text/rss", "application/rdf+xml", "application/rss+xml", "application/atom+xml" }; + } + + public String getName() { + return "rssfiledetector"; + } + + public String getVersion() { + return "0.1"; + } + + public String[] process(File file, int offset, int length, long bitmask, char comparator, String mimeType, Map params) { + FileInputStream fileInput = null; + try { + fileInput = new FileInputStream(file); + return detect(fileInput); + } catch (Exception e) { + return null; + } finally { + if (fileInput != null) try { fileInput.close(); } catch (Exception e) { /* ignore this */ } + } + } + + public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType, Map params) { + ByteArrayInputStream input = new ByteArrayInputStream(data); + return detect(input); + } + + private String[] detect(InputStream input) { + try { + + // getting the format detector class + Class formatDetector = Class.forName("de.nava.informa.utils.FormatDetector"); + + // getting the proper method + Method getFormat = formatDetector.getMethod("getFormat", new Class[]{InputStream.class}); + + // invoke the method + Object format = getFormat.invoke(null, new Object[] {input}); + + if (format == null) return null; + else if (format.toString().startsWith("RSS ")) return new String[]{"application/rss+xml"}; + else if (format.toString().startsWith("Atom ")) return new String[]{"application/atom+xml"}; + else return null; + } catch (Exception e) { + return null; + } catch (Error e) { + return null; + } + } + +} diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index c70c4e26c..023f38b40 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -45,7 +45,6 @@ package de.anomic.plasma.parser.tar; import java.io.File; import java.io.InputStream; -import de.anomic.net.URL; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; @@ -57,6 +56,7 @@ import java.util.zip.GZIPInputStream; import com.ice.tar.TarEntry; import com.ice.tar.TarInputStream; +import de.anomic.net.URL; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; @@ -109,6 +109,8 @@ public class tarParser extends AbstractParser implements Parser { source = new GZIPInputStream(source); } + // TODO: what about bzip .... + StringBuffer docKeywords = new StringBuffer(); StringBuffer docShortTitle = new StringBuffer(); StringBuffer docLongTitle = new StringBuffer(); @@ -154,7 +156,7 @@ public class tarParser extends AbstractParser implements Parser { // parsing the content theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile); } catch (ParserException e) { - this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getErrorCode()); + this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage()); } finally { if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} } diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 7b55085d8..aca8f6505 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -140,7 +140,7 @@ public class zipParser extends AbstractParser implements Parser { // parsing the zip file entry theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile); } catch (ParserException e) { - this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getErrorCode()); + this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage()); } finally { if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} } @@ -185,7 +185,7 @@ public class zipParser extends AbstractParser implements Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; - throw new ParserException("Unexpected error while parsing zip resource. " + e.getMessage(),location); + throw new ParserException("Unexpected error while parsing zip resource. " + e.getClass().getName() + ": "+ e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 0e5933193..536307b8a 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -50,7 +50,6 @@ import java.io.FileFilter; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; @@ -71,16 +70,13 @@ import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterInputStream; import de.anomic.htmlFilter.htmlFilterWriter; -import de.anomic.http.httpHeader; import de.anomic.http.httpc; -import de.anomic.index.indexURL; import de.anomic.net.URL; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserInfo; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; -import de.anomic.tools.bitfield; public final class plasmaParser { public static final String PARSER_MODE_PROXY = "PROXY"; @@ -512,7 +508,7 @@ public final class plasmaParser { // testing if parsing is supported for this resource if (!plasmaParser.supportedContent(location,mimeType)) { - String errorMsg = "No parser available to parse mimetype"; + String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); } @@ -588,7 +584,7 @@ public final class plasmaParser { } else if (realtimeParsableMimeTypesContains(mimeType)) { doc = parseHtml(location, mimeType, documentCharset, sourceFile); } else { - String errorMsg = "No parser available to parse mimetype"; + String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index efed8fbba..4c6da8f24 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -434,11 +434,15 @@ public class plasmaSnippetCache { try { if (resource == null) return null; - // try to get the header from the htcache directory + // if no resource metadata is available, try to load it if (docInfo == null) { - try { + // try to get the header from the htcache directory + try { docInfo = this.cacheManager.loadResourceInfo(url); } catch (Exception e) {} + + // TODO: try to load it from web + } if (docInfo == null) { diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index f35b56133..1679e2351 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -274,8 +274,14 @@ public final class serverFileUtils { } public static byte[] read(InputStream source) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - copy(source, baos, -1); + return read(source,-1); + } + + public static byte[] read(InputStream source, long count) throws IOException { + ByteArrayOutputStream baos = (count > 0) + ? new ByteArrayOutputStream((int)count) + : new ByteArrayOutputStream(); + copy(source, baos, count); baos.close(); return baos.toByteArray(); }