diff --git a/htroot/Settings_Parser.inc b/htroot/Settings_Parser.inc index 5b2a38a62..2953b68cd 100644 --- a/htroot/Settings_Parser.inc +++ b/htroot/Settings_Parser.inc @@ -7,23 +7,30 @@ For a detailed description of the various MIME-types take a look at Activate Mime-Type -Parser Class Name +Parser Usage #{parser}# - - #[mime]# - #[shortname]# + #[name]# V#[version]# + #[usage]# +   + +#{mime}# + + + #[mimetype]# +   - + +#{/mime}# #{/parser}# - + Enable all parsers   - +  Changes take effect immediately diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index 1ed1b2643..af3735ee5 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -45,13 +45,16 @@ import java.util.Arrays; import java.util.Collections; +import java.util.Enumeration; import java.util.HashMap; +import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import java.util.List; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.ParserInfo; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -249,29 +252,42 @@ public final class Settings_p { * Parser Configuration */ plasmaSwitchboard sb = (plasmaSwitchboard)env; - Hashtable enabledParsers = sb.parser.getEnabledParserList(); - Hashtable availableParsers = sb.parser.getAvailableParserList(); + HashSet enabledParsers = sb.parser.getEnabledParserList(); + HashSet parserInfos = new HashSet(sb.parser.getAvailableParserList().values()); - // fetching a list of all available mimetypes - List availableParserKeys = Arrays.asList(availableParsers.keySet().toArray(new String[availableParsers.size()])); - - // sort it - Collections.sort(availableParserKeys); +// // fetching a list of all available mimetypes +// List availableParserKeys = Arrays.asList(availableParsers.entrySet().toArray(new ParserInfo[availableParsers.size()])); +// +// // sort it +// Collections.sort(availableParserKeys); // loop through the mimeTypes and add it to the properties boolean allParsersEnabled = true; int parserIdx = 0; - Iterator availableParserIter = availableParserKeys.iterator(); + + Iterator availableParserIter = parserInfos.iterator(); while (availableParserIter.hasNext()) { - String mimeType = (String) availableParserIter.next(); - String parserName = (String) availableParsers.get(mimeType); - boolean parserIsEnabled = enabledParsers.containsKey(mimeType); + ParserInfo parserInfo = (ParserInfo) availableParserIter.next(); + prop.put("parser_" + parserIdx + "_name", parserInfo.parserName); + prop.put("parser_" + parserIdx + "_version", parserInfo.parserVersionNr); + prop.put("parser_" + parserIdx + "_usage", Integer.toString(parserInfo.usageCount)); - prop.put("parser_" + parserIdx + "_mime", mimeType); - prop.put("parser_" + parserIdx + "_name", parserName); - prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1)); - prop.put("parser_" + parserIdx + "_status", parserIsEnabled ? 1:0); - allParsersEnabled &= parserIsEnabled; + int mimeIdx = 0; + Enumeration mimeTypeIter = parserInfo.supportedMimeTypes.keys(); + while (mimeTypeIter.hasMoreElements()) { + String mimeType = (String)mimeTypeIter.nextElement(); + + boolean parserIsEnabled = enabledParsers.contains(mimeType); + + prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType); + //prop.put("parser_" + parserIdx + "_name", parserName); + //prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1)); + prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", enabledParsers.contains(mimeType) ? 1:0); + allParsersEnabled &= parserIsEnabled; + + mimeIdx++; + } + prop.put("parser_" + parserIdx + "_mime", mimeIdx); parserIdx++; } diff --git a/httpd.mime b/httpd.mime index 1327ce1d5..157655749 100644 --- a/httpd.mime +++ b/httpd.mime @@ -37,6 +37,7 @@ mov = video/quicktime mpe = video/mpeg mpeg = video/mpeg mpg = video/mpeg +odt = application/vnd.oasis.opendocument.text ogg = audio/ogg-vorbis pac = application/x-ns-proxy-autoconfig pdf = application/pdf @@ -70,6 +71,7 @@ tif = image/tiff tiff = image/tiff torrent = application/x-bittorrent txt = text/plain +vcf = text/x-vcard wav = audio/x-wav xhtml = application/xhtml+xml xla = application/msexcel diff --git a/libx/jmimemagic-0.0.4a.jar b/libx/jmimemagic-0.0.4a.jar index b3cbfea22..66d420782 100644 Binary files a/libx/jmimemagic-0.0.4a.jar and b/libx/jmimemagic-0.0.4a.jar differ diff --git a/libx/odf_utils_05_11_10.jar b/libx/odf_utils_05_11_10.jar new file mode 100644 index 000000000..4e7a174f3 Binary files /dev/null and b/libx/odf_utils_05_11_10.jar differ diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index 5860a3da1..3eeb80df4 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -346,7 +346,7 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http int argc; if (argsString == null) { // no args here, maybe a POST with multipart extension - int length; + int length = 0; //System.out.println("HEADER: " + requestHeader.toString()); // DEBUG if (method.equals(httpHeader.METHOD_POST)) { @@ -356,10 +356,11 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http } else if (requestHeader.gzip()) { length = -1; gzipBody = new GZIPInputStream(body); - } else { - httpd.sendRespondError(conProp,out,4,403,null,"bad post values",null); - return; } +// } else { +// httpd.sendRespondError(conProp,out,4,403,null,"bad post values",null); +// return; +// } // if its a POST, it can be either multipart or as args in the body if ((requestHeader.containsKey(httpHeader.CONTENT_TYPE)) && @@ -438,7 +439,7 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http } }else{ //you cannot share a .png/.gif file with a name like a class in htroot. - if ( !(targetFile.exists()) && !((path.endsWith("png")||path.endsWith("gif"))&&targetClass!=null ) ){ + if ( !(targetFile.exists()) && !((path.endsWith("png")||path.endsWith("gif")||path.endsWith(".stream"))&&targetClass!=null ) ){ targetFile = new File(htDocsPath, path); targetClass = rewriteClassFile(new File(htDocsPath, path)); } @@ -486,6 +487,20 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http Thread.currentThread().sleep(200); // see below serverFileUtils.write(result, out); } + } else if ((targetClass != null) && (path.endsWith(".stream"))) { + // call rewrite-class + requestHeader.put("CLIENTIP", conProp.getProperty("CLIENTIP")); + requestHeader.put("PATH", path); + requestHeader.put("INPUTSTREAM", body); + requestHeader.put("OUTPUTSTREAM", out); + + httpd.sendRespondHeader(this.connectionProperties, out, httpVersion, 200, null); + + // in case that there are no args given, args = null or empty hashmap + serverObjects tp = (serverObjects) rewriteMethod(targetClass).invoke(null, new Object[] {requestHeader, args, switchboard}); + + this.forceConnectionClose(); + return; } else if ((targetFile.exists()) && (targetFile.canRead())) { // we have found a file that can be written to the client // if this file uses templates, then we use the template diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index f7ed3ab35..e7defd382 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -73,6 +73,16 @@ public abstract class AbstractParser implements Parser{ * purposes. */ protected serverLog theLogger = null; + + /** + * Version number of the parser + */ + protected String parserVersionNr = "0.1"; + + /** + * Parser name + */ + protected String parserName = this.getClass().getSimpleName(); /** * The Constructor of this class. @@ -165,4 +175,18 @@ public abstract class AbstractParser implements Parser{ this.theLogger = log; } + /** + * Returns the version number of the parser + * @return parser version number + */ + public String getVersion() { + return this.parserVersionNr; + } + + /** + * Return the name of the parser + */ + public String getName() { + return parserName; + } } diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 038b88505..dd3875d2a 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -122,4 +122,17 @@ public interface Parser { */ public void setLogger(serverLog log); + /** + * Returns the version number of the current parser + * @return parser version number + */ + public String getVersion(); + + /** + * Returns the name of the parser + * @return parser name + */ + public String getName(); } + + diff --git a/source/de/anomic/plasma/parser/ParserInfo.java b/source/de/anomic/plasma/parser/ParserInfo.java new file mode 100644 index 000000000..a7e68663a --- /dev/null +++ b/source/de/anomic/plasma/parser/ParserInfo.java @@ -0,0 +1,34 @@ +package de.anomic.plasma.parser; + +import java.util.Hashtable; + +public class ParserInfo { + // general parser info + public Class parserClass; + public String parserClassName; + + public String parserName; + public String parserVersionNr; + + // parser properties + public String[] libxDependencies; + public Hashtable supportedMimeTypes; + + // usage statistic + public int usageCount = 0; + + public String toString() { + StringBuffer toStr = new StringBuffer(); + + toStr.append(this.parserName).append(" V") + .append((this.parserVersionNr==null)?"0.0":this.parserVersionNr).append(" | ") + .append(this.parserClassName).append(" | ") + .append(this.supportedMimeTypes); + + return toStr.toString(); + } + + public synchronized void incUsageCounter() { + this.usageCount++; + } +} diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index e9e378b81..887932f5e 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -77,9 +77,10 @@ public class bzipParser extends AbstractParser implements Parser { private static final String[] LIBX_DEPENDENCIES = new String[] { "bzip2.jar" }; - + public bzipParser() { super(LIBX_DEPENDENCIES); + parserName = "Bzip 2 UNIX Compressed File Parser"; } public Hashtable getSupportedMimeTypes() { @@ -105,7 +106,6 @@ public class bzipParser extends AbstractParser implements Parser { int read = 0; byte[] data = new byte[1024]; - CBZip2InputStream zippedContent = new CBZip2InputStream(source); tempFile = File.createTempFile("bunzip","tmp"); diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 0ed57ad5f..c339ad006 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -75,6 +75,7 @@ implements Parser { public docParser() { super(LIBX_DEPENDENCIES); + parserName = "Word Document Parser"; } public plasmaParserDocument parse(URL location, String mimeType, diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index f33993f1f..c3028d561 100644 --- a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -76,6 +76,7 @@ public class gzipParser extends AbstractParser implements Parser { public gzipParser() { super(LIBX_DEPENDENCIES); + parserName = "GNU Zip Compressed Archive Parser"; } public Hashtable getSupportedMimeTypes() { diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 910ac7f3c..6f2f73454 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -47,6 +47,7 @@ import java.io.File; import java.io.InputStream; import java.net.URL; import java.util.Collection; +import java.util.HashMap; import java.util.Hashtable; import org.apache.log4j.Level; @@ -74,7 +75,10 @@ implements Parser { static { SUPPORTED_MIME_TYPES.put("text/xml","xml"); SUPPORTED_MIME_TYPES.put("application/xml","xml"); - SUPPORTED_MIME_TYPES.put("application/octet-stream",""); + SUPPORTED_MIME_TYPES.put("application/x-xml","xml"); + SUPPORTED_MIME_TYPES.put("application/octet-stream",""); + SUPPORTED_MIME_TYPES.put("application/x-compress",""); + SUPPORTED_MIME_TYPES.put("application/x-compressed",""); } /** @@ -88,14 +92,56 @@ implements Parser { "xerces.jar" }; + /** + * Helping structure used to detect loops in the mimeType detection + * process + */ + private static Hashtable threadLoopDetection = new Hashtable(); + public mimeTypeParser() { super(LIBX_DEPENDENCIES); + parserName = "MimeType Parser"; + } + + public String getMimeType (File sourceFile) { + String mimeType = null; + + try { + Magic theMagic = new Magic(); + MagicMatch match = theMagic.getMagicMatch(sourceFile); + + // if a match was found we can return the new mimeType + if (match!=null) { + Collection subMatches = match.getSubMatches(); + if ((subMatches != null) && (!subMatches.isEmpty())) { + mimeType = ((MagicMatch) subMatches.iterator().next()).getMimeType(); + } else { + mimeType = match.getMimeType(); + } + return mimeType; + } + } catch (Exception e) { + + } + return null; } public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException { + String orgMimeType = mimeType; + // determining the mime type of the file ... try { + // adding current thread to loop detection list + Integer loopDepth = null; + if (threadLoopDetection.containsKey(Thread.currentThread())) { + loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread()); + } else { + loopDepth = new Integer(0); + } + if (loopDepth.intValue() > 5) return null; + threadLoopDetection.put(Thread.currentThread(),new Integer(loopDepth.intValue()+1)); + // deactivating the logging for jMimeMagic Logger theLogger = Logger.getLogger("net.sf.jmimemagic"); theLogger.setLevel(Level.OFF); @@ -115,6 +161,7 @@ implements Parser { // to avoid loops we have to test if the mimetype has changed ... if (this.getSupportedMimeTypes().containsKey(mimeType)) return null; + if (orgMimeType.equals(mimeType)) return null; plasmaParser theParser = new plasmaParser(); return theParser.parseSource(location,mimeType,sourceFile); @@ -123,6 +170,13 @@ implements Parser { } catch (Exception e) { return null; + } finally { + Integer loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread()); + if (loopDepth.intValue() <= 1) { + threadLoopDetection.remove(Thread.currentThread()); + } else { + threadLoopDetection.put(Thread.currentThread(), new Integer(loopDepth.intValue()-1)); + } } } diff --git a/source/de/anomic/plasma/parser/odt/build.xml b/source/de/anomic/plasma/parser/odt/build.xml new file mode 100644 index 000000000..25ae3e5d5 --- /dev/null +++ b/source/de/anomic/plasma/parser/odt/build.xml @@ -0,0 +1,55 @@ + + + + A class to parse gzip files + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java new file mode 100644 index 000000000..3b9708472 --- /dev/null +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -0,0 +1,214 @@ +//zipParser.java +//------------------------ +//part of YaCy +//(C) by Michael Peter Christen; mc@anomic.de +//first published on http://www.anomic.de +//Frankfurt, Germany, 2005 +// +//this file is contributed by Martin Thelian +//last major change: 16.05.2005 +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. +// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. +// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//Using this software in any meaning (reading, learning, copying, compiling, +//running) means that you agree that the Author(s) is (are) not responsible +//for cost, loss of data or any harm that may be caused directly or indirectly +//by usage of this softare or this documentation. The usage of this software +//is on your own risk. The installation and usage (starting/running) of this +//software may allow other people or application to access your computer and +//any attached devices and is highly dependent on the configuration of the +//software which must be done by the user of the software; the author(s) is +//(are) also not responsible for proper configuration and usage of the +//software, even if provoked by documentation provided together with +//the software. +// +//Any changes to this file according to the GPL as documented in the file +//gpl.txt aside this file in the shipment you received can be done to the +//lines that follows this copyright notice here, but changes must not be +//done inside the copyright notive above. A re-distribution must contain +//the intact and unchanged copyright notice. +//Contributions and changes to the program code must be marked as such. + +package de.anomic.plasma.parser.odt; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.InputStream; +import java.net.URL; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +import com.catcode.odf.ODFMetaFileAnalyzer; +import com.catcode.odf.OpenDocumentMetadata; +import com.catcode.odf.OpenDocumentTextInputStream; + +import de.anomic.http.httpc; +import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.AbstractParser; +import de.anomic.plasma.parser.Parser; +import de.anomic.plasma.parser.ParserException; +import de.anomic.server.serverFileUtils; +import de.anomic.server.logging.serverLog; + +public class odtParser extends AbstractParser implements Parser { + + /** + * a list of mime types that are supported by this parser class + * @see #getSupportedMimeTypes() + */ + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + static { + SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt"); + SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt"); + } + + /** + * a list of library names that are needed by this parser + * @see Parser#getLibxDependences() + */ + private static final String[] LIBX_DEPENDENCIES = new String[] {"odf_utils_05_11_10.jar"}; + + public odtParser() { + super(LIBX_DEPENDENCIES); + parserName = "OASIS OpenDocument V2 Text Document Parser"; + } + + public Hashtable getSupportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + + public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException { + + try { + byte[] docContent = null; + String docDescription = null; + String docKeywords = null; + String docShortTitle = null; + String docLongTitle = null; + + // opening the file as zip file + ZipFile zipFile= new ZipFile(dest); + Enumeration zipEnum = zipFile.entries(); + + // looping through all containing files + while (zipEnum.hasMoreElements()) { + ZipEntry zipEntry= (ZipEntry) zipEnum.nextElement(); + String entryName = zipEntry.getName(); + + // content.xml contains the document content in xml format + if (entryName.equals("content.xml")) { + InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); + OpenDocumentTextInputStream odStream = new OpenDocumentTextInputStream(zipFileEntryStream); + docContent = serverFileUtils.read(odStream); + + // meta.xml contains metadata about the document + } else if (entryName.equals("meta.xml")) { + InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); + ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer(); + OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream); + docDescription = metaData.getDescription(); + docKeywords = metaData.getKeyword(); + docShortTitle = metaData.getTitle(); + docLongTitle = metaData.getSubject(); + + // if there is no title availabe we generate one + if (docLongTitle == null) { + if (docShortTitle != null) { + docLongTitle = docShortTitle; + } else if (docContent.length <= 80) { + docLongTitle = new String(docContent); + } else { + byte[] title = new byte[80]; + System.arraycopy(docContent, 0, title, 0, 80); + docLongTitle = new String(title); + } + docLongTitle. + replaceAll("\r\n"," "). + replaceAll("\n"," "). + replaceAll("\r"," "). + replaceAll("\t"," "); + } + } + } + + return new plasmaParserDocument( + location, + mimeType, + docKeywords, + docShortTitle, + docLongTitle, + null, + docDescription, + docContent, + null, + null); + } catch (Exception e) { + e.printStackTrace(); + throw new ParserException("Unable to parse the odt content. " + e.getMessage()); + } catch (Error e) { + throw new ParserException("Unable to parse the odt content. " + e.getMessage()); + } + } + + public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { + File dest = null; + try { + // creating a tempfile + dest = File.createTempFile("OpenDocument", ".odt"); + dest.deleteOnExit(); + + // copying the stream into a file + serverFileUtils.copy(source, dest); + + // parsing the content + return parse(location, mimeType, dest); + } catch (Exception e) { + throw new ParserException("Unable to parse the odt document. " + e.getMessage()); + } finally { + if (dest != null) try { dest.delete(); } catch (Exception e){} + } + } + + public void reset() { + // Nothing todo here at the moment + + } + + public static void main(String[] args) { + try { + if (args.length != 1) return; + + // getting the content URL + URL contentUrl = new URL(args[0]); + + // creating a new parser + odtParser testParser = new odtParser(); + + // setting the parser logger + testParser.setLogger(new serverLog("PARSER.ODT")); + + // downloading the document content + byte[] content = httpc.singleGET(contentUrl, 10000, null, null, null); + ByteArrayInputStream input = new ByteArrayInputStream(content); + + // parsing the document + testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input); + } catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 70db05b45..706d41413 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -74,10 +74,11 @@ public class pdfParser extends AbstractParser implements Parser { */ private static final String[] LIBX_DEPENDENCIES = new String[] { "PDFBox-0.7.2.jar" - }; + }; public pdfParser() { super(LIBX_DEPENDENCIES); + parserName = "Acrobat Portable Document Parser"; } public Hashtable getSupportedMimeTypes() { diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index dd2d8121c..1197cafa8 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -91,10 +91,11 @@ public class rssParser extends AbstractParser implements Parser { "informa-0.6.0.jar", "commons-logging.jar", "jdom.jar" - }; + }; public rssParser() { super(LIBX_DEPENDENCIES); + parserName = "Rich Site Summary/Atom Feed Parser"; } public plasmaParserDocument parse(URL location, String mimeType, diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 329605cce..667fc5285 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -73,10 +73,11 @@ implements Parser { * a list of library names that are needed by this parser * @see Parser#getLibxDependences() */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; + private static final String[] LIBX_DEPENDENCIES = new String[] {}; public rtfParser() { super(LIBX_DEPENDENCIES); + parserName = "Rich Text Format Parser"; } public plasmaParserDocument parse(URL location, String mimeType, diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 71dafb99f..10d512425 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -44,13 +44,16 @@ package de.anomic.plasma.parser.tar; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.InputStream; +import java.io.PushbackInputStream; import java.net.URL; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; +import java.util.zip.GZIPInputStream; import com.ice.tar.TarEntry; import com.ice.tar.TarInputStream; @@ -61,6 +64,7 @@ import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverByteBuffer; +import de.anomic.server.serverFileUtils; public class tarParser extends AbstractParser implements Parser { @@ -71,6 +75,7 @@ public class tarParser extends AbstractParser implements Parser { public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-tar","tar"); + SUPPORTED_MIME_TYPES.put("application/tar","tar"); } /** @@ -83,6 +88,7 @@ public class tarParser extends AbstractParser implements Parser { public tarParser() { super(LIBX_DEPENDENCIES); + parserName = "Tape Archive File Parser"; } public Hashtable getSupportedMimeTypes() { @@ -92,6 +98,18 @@ public class tarParser extends AbstractParser implements Parser { public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { try { + // creating a new parser class to parse the unzipped content + plasmaParser theParser = new plasmaParser(); + + /* + * If the mimeType was not reported correcly by the webserve we + * have to decompress it first + */ + String ext = plasmaParser.getFileExt(location).toLowerCase(); + if (ext.equals("gz") || ext.equals("tgz")) { + source = new GZIPInputStream(source); + } + StringBuffer docKeywords = new StringBuffer(); StringBuffer docShortTitle = new StringBuffer(); StringBuffer docLongTitle = new StringBuffer(); @@ -100,11 +118,7 @@ public class tarParser extends AbstractParser implements Parser { serverByteBuffer docText = new serverByteBuffer(); Map docAnchors = new HashMap(); Map docImages = new HashMap(); - - - // creating a new parser class to parse the unzipped content - plasmaParser theParser = new plasmaParser(); - + // looping through the contained files TarEntry entry; TarInputStream tin = new TarInputStream(source); @@ -113,22 +127,34 @@ public class tarParser extends AbstractParser implements Parser { if (entry.isDirectory()) continue; // Get the entry name - String entryName = entry.getName(); - int idx = entryName.lastIndexOf("."); - String entryExt = (idx > -1) ? entryName.substring(idx+1) : null; + int idx = -1; + String entryName = entry.getName(); + idx = entryName.lastIndexOf("/"); + if (idx != -1) entryName = entryName.substring(idx+1); + idx = entryName.lastIndexOf("."); + String entryExt = (idx > -1) ? entryName.substring(idx+1) : ""; // trying to determine the mimeType per file extension String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); // getting the entry content - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - byte[] buf = new byte[(int) entry.getSize()]; - int bytesRead = tin.read(buf); - bos.write(buf); - byte[] ut = bos.toByteArray(); - - // parsing the content - plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut); + plasmaParserDocument theDoc = null; + File tempFile = null; + try { + + + byte[] buf = new byte[(int) entry.getSize()]; + int bytesRead = tin.read(buf); + + tempFile = File.createTempFile("tarParser_" + ((idx>-1)?entryName.substring(0,idx):entryName), (entryExt.length()>0)?"."+entryExt:entryExt); + serverFileUtils.write(buf, tempFile); + + // parsing the content + + theDoc = theParser.parseSource(tempFile.toURL(),entryMime,tempFile); + } finally { + if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){} + } if (theDoc == null) continue; // merging all documents together diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 0e1fe5b61..c46771f67 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -86,10 +86,11 @@ public class vcfParser extends AbstractParser implements Parser { * a list of library names that are needed by this parser * @see Parser#getLibxDependences() */ - private static final String[] LIBX_DEPENDENCIES = new String[] {"commons-codec-1.3.jar"}; + private static final String[] LIBX_DEPENDENCIES = new String[] {"commons-codec-1.3.jar"}; public vcfParser() { super(LIBX_DEPENDENCIES); + parserName = "vCard Parser"; } public Hashtable getSupportedMimeTypes() { diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index fa5cd2388..6372eec0d 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -79,10 +79,11 @@ public class zipParser extends AbstractParser implements Parser { * a list of library names that are needed by this parser * @see Parser#getLibxDependences() */ - private static final String[] LIBX_DEPENDENCIES = new String[] {}; + private static final String[] LIBX_DEPENDENCIES = new String[] {}; public zipParser() { super(LIBX_DEPENDENCIES); + parserName = "Compressed Archive File Parser"; } public Hashtable getSupportedMimeTypes() { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index e1551aaaf..41f8306ee 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -46,7 +46,6 @@ package de.anomic.plasma; import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; @@ -70,8 +69,10 @@ import java.util.Set; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterOutputStream; +import de.anomic.http.httpc; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; +import de.anomic.plasma.parser.ParserInfo; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; @@ -92,7 +93,7 @@ public final class plasmaParser { * @see #loadEnabledParserList() * @see #setEnabledParserList(Enumeration) */ - private static final Properties enabledParserList = new Properties(); + private static final HashSet enabledParserList = new HashSet(); /** * A list of file extensions that are supported by all enabled parsers @@ -104,12 +105,43 @@ public final class plasmaParser { * be parsed in realtime. */ private static final HashSet supportedRealtimeFileExt = new HashSet(); + + /** + * A list of mimeTypes that are generic + */ + private static final HashSet genericMimeTypes = new HashSet(); + static { + genericMimeTypes.add("text/plain"); + genericMimeTypes.add("text/text"); + genericMimeTypes.add("text/xml"); + genericMimeTypes.add("application/xml"); + genericMimeTypes.add("application/x-xml"); + genericMimeTypes.add("application/octet-stream"); + genericMimeTypes.add("application/zip"); + genericMimeTypes.add("application/x-zip"); + genericMimeTypes.add("application/x-zip-compressed"); + genericMimeTypes.add("application/x-compress"); + genericMimeTypes.add("application/x-compressed"); + } /** * A list of mimeTypes that can be parsed in Realtime (on the fly) */ private static final HashSet realtimeParsableMimeTypes = new HashSet(); + private static final Properties mimeTypeLookupByFileExt = new Properties(); + static { + // loading a list of extensions from file + BufferedInputStream bufferedIn = null; + try { + mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")))); + } catch (IOException e) { + System.err.println("ERROR: httpd.mime not found in settings path"); + } finally { + if (bufferedIn != null) try{bufferedIn.close();}catch(Exception e){} + } + } + /** * A pool of parsers. * @see plasmaParserPool @@ -162,7 +194,7 @@ public final class plasmaParser { // The maximum number of idle connections connections in the pool // 0 = no limit. - config.maxIdle = 10; + config.maxIdle = 5; config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK; config.minEvictableIdleTimeMillis = 30000; @@ -175,6 +207,8 @@ public final class plasmaParser { loadAvailableParserList(); } + private serverLog theLogger = new serverLog("PARSER"); + /** * This function is used to initialize the realtimeParsableMimeTypes List. * This list contains a list of mimeTypes that can be parsed in realtime by @@ -185,7 +219,7 @@ public final class plasmaParser { public static void initRealtimeParsableMimeTypes(String realtimeParsableMimeTypes) { LinkedList mimeTypes = new LinkedList(); if ((realtimeParsableMimeTypes == null) || (realtimeParsableMimeTypes.length() == 0)) { - + // Nothing todo here } else { String[] realtimeParsableMimeTypeList = realtimeParsableMimeTypes.split(","); for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim()); @@ -280,7 +314,7 @@ public final class plasmaParser { } synchronized (enabledParserList) { - return enabledParserList.containsKey(mimeType); + return enabledParserList.contains(mimeType); } } @@ -302,7 +336,7 @@ public final class plasmaParser { // termining last position of . in file path p = name.lastIndexOf('.'); - if (p < 0) return name; // seams to be strange, but this is a directory entry or default file (html) + if (p < 0) return ""; return name.substring(p + 1); } @@ -352,19 +386,8 @@ public final class plasmaParser { return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); } - public static String getMimeTypeByFileExt(String fileExt) { - // loading a list of extensions from file - Properties prop = new Properties(); - BufferedInputStream bufferedIn = null; - try { - prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")))); - } catch (IOException e) { - System.err.println("ERROR: httpd.mime not found in settings path"); - } finally { - if (bufferedIn != null) try{bufferedIn.close();}catch(Exception e){} - } - - return prop.getProperty(fileExt,"application/octet-stream"); + public static String getMimeTypeByFileExt(String fileExt) { + return mimeTypeLookupByFileExt.getProperty(fileExt,"application/octet-stream"); } public plasmaParser() { @@ -373,7 +396,7 @@ public final class plasmaParser { public static String[] setEnabledParserList(Set mimeTypeSet) { - Properties newEnabledParsers = new Properties(); + HashSet newEnabledParsers = new HashSet(); HashSet newSupportedFileExt = new HashSet(); if (mimeTypeSet != null) { @@ -384,7 +407,7 @@ public final class plasmaParser { Parser theParser = null; try { // getting the parser - theParser = (Parser) plasmaParser.theParserPool.borrowObject(availableParserList.get(mimeType)); + theParser = (Parser) plasmaParser.theParserPool.borrowObject(((ParserInfo)availableParserList.get(mimeType)).parserClassName); // getting a list of mimeTypes that the parser supports Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes(); @@ -397,7 +420,7 @@ public final class plasmaParser { newSupportedFileExt.addAll(Arrays.asList(extArray)); } } - newEnabledParsers.put(mimeType,availableParserList.get(mimeType)); + newEnabledParsers.add(mimeType); } catch (Exception e) { serverLog.logSevere("PARSER", "error in setEnabledParserList", e); @@ -411,7 +434,7 @@ public final class plasmaParser { synchronized (enabledParserList) { enabledParserList.clear(); - enabledParserList.putAll(newEnabledParsers); + enabledParserList.addAll(newEnabledParsers); } @@ -420,34 +443,18 @@ public final class plasmaParser { supportedFileExt.addAll(newSupportedFileExt); } - return (String[])newEnabledParsers.keySet().toArray(new String[newEnabledParsers.size()]); + return (String[])newEnabledParsers.toArray(new String[newEnabledParsers.size()]); } - public Hashtable getEnabledParserList() { + public HashSet getEnabledParserList() { synchronized (plasmaParser.enabledParserList) { - return (Hashtable) plasmaParser.enabledParserList.clone(); + return (HashSet) plasmaParser.enabledParserList.clone(); } } public Hashtable getAvailableParserList() { return plasmaParser.availableParserList; - } - - private static void loadEnabledParserList() { - // loading a list of availabe parser from file - Properties prop = new Properties(); - BufferedInputStream bufferedIn = null; - try { - prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser")))); - } catch (IOException e) { - System.err.println("ERROR: yacy.parser not found in settings path"); - } finally { - if (bufferedIn != null) try{ bufferedIn.close(); }catch(Exception e){} - } - - // enable them ... - setEnabledParserList(prop.keySet()); - } + } private static void loadAvailableParserList() { try { @@ -474,9 +481,11 @@ public final class plasmaParser { */ File[] parserDirectories = parserDir.listFiles(parserDirectoryFilter); if (parserDirectories == null) return; + for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) { File currentDir = parserDirectories[parserDirNr]; serverLog.logFine("PARSER", "Searching in directory " + currentDir.toString()); + String[] parserClasses = currentDir.list(parserFileNameFilter); if (parserClasses == null) continue; @@ -506,12 +515,25 @@ public final class plasmaParser { // loading the list of mime-types that are supported by this parser class Hashtable supportedMimeTypes = ((Parser)theParser).getSupportedMimeTypes(); + + // creating a parser info object + ParserInfo parserInfo = new ParserInfo(); + parserInfo.parserClass = parserClass; + parserInfo.parserClassName = fullClassName; + parserInfo.libxDependencies = neededLibx; + parserInfo.supportedMimeTypes = supportedMimeTypes; + parserInfo.parserVersionNr = ((Parser)theParser).getVersion(); + parserInfo.parserName = ((Parser)theParser).getName(); + Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator(); while (mimeTypeIterator.hasNext()) { String mimeType = (String) mimeTypeIterator.next(); - availableParserList.put(mimeType,fullClassName); + availableParserList.put(mimeType,parserInfo ); serverLog.logInfo("PARSER", "Found functional parser for mimeType '" + mimeType + "'." + - ((neededLibxBuf.length()>0)?"\n Dependencies: " + neededLibxBuf.toString():"")); + "\n\tName: " + parserInfo.parserName + + "\n\tVersion: " + parserInfo.parserVersionNr + + "\n\tClass: " + parserInfo.parserClassName + + ((neededLibxBuf.length()>0)?"\n\tDependencies: " + neededLibxBuf.toString():"")); } } catch (Exception e) { /* we can ignore this for the moment */ @@ -537,50 +559,19 @@ public final class plasmaParser { try { theParserPool.close(); } catch (Exception e) { } - } + } public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) { - - Parser theParser = null; + File tempFile = null; try { - mimeType = getRealMimeType(mimeType); - String fileExt = getFileExt(location); - - // TODO: Handling of not trustable mimeTypes - // text/plain, octet-stream - if ( - (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) || - (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt")) - ) { - if (enabledParserList.containsKey("application/octet-stream")) { - mimeType = "application/octet-stream"; - } - } - - // getting the correct parser for the given mimeType - theParser = this.getParser(mimeType); - - // if a parser was found we use it ... - if (theParser != null) { - return theParser.parse(location, mimeType,source); - } else if (realtimeParsableMimeTypesContains(mimeType)) { - // ... otherwise we make a html scraper and transformer - htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); - OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - hfos.write(source); - hfos.close(); - return transformScraper(location, mimeType, scraper); - } else { - return null; - } - } catch (Exception e) { - //e.printStackTrace(); + tempFile = File.createTempFile("parseSource", ".tmp"); + return parseSource(location, mimeType, tempFile); + } catch (Exception e) { return null; } finally { - if ((theParser != null) && (supportedMimeTypesContains(mimeType))) { - try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) {} - } + if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){} } + } public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) { @@ -590,16 +581,58 @@ public final class plasmaParser { mimeType = getRealMimeType(mimeType); String fileExt = getFileExt(location); - // TODO: Handling of not trustable mimeTypes - // text/plain, octet-stream - if ( - (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) || - (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt")) - ) { - if (enabledParserList.containsKey("application/octet-stream")) { - mimeType = "application/octet-stream"; - } - } + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing " + location + " with mimeType '" + mimeType + + "' and file extension '" + fileExt + "'."); + + /* + * There are some problematic mimeType - fileExtension combination where we have to enforce + * a mimeType detection to get the proper parser for the content + * + * - application/zip + .odt + * - text/plain + .odt + * - text/plain + .vcf + * - text/xml + .rss + * - text/xml + .atom + * + * In all these cases we can trust the fileExtension and have to determine the proper mimeType. + * + */ + +// // Handling of not trustable mimeTypes +// // - text/plain +// // - text/xml +// // - application/octet-stream +// // - application/zip +// if ( +// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) || +// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt")) +// ) { +// if (this.theLogger.isFine()) +// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType + +// "' that seems not to be correct for file extension '" + fileExt + "'."); +// +// if (enabledParserList.containsKey("application/octet-stream")) { +// theParser = this.getParser("application/octet-stream"); +// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile); +// if (newMime == null) +// if (newMime instanceof String) { +// String newMimeType = (String)newMime; +// if ((newMimeType.equals("application/octet-stream")) { +// return null; +// } +// mimeType = newMimeType; +// } +// } else { +// return null; +// } +// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){ +// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) { +// mimeType = "application/vnd.oasis.opendocument.text"; +// } else { +// return null; +// } +// } // getting the correct parser for the given mimeType theParser = this.getParser(mimeType); @@ -647,16 +680,18 @@ public final class plasmaParser { * @param mimeType * @return */ - public Parser getParser(String mimeType) { + private Parser getParser(String mimeType) { mimeType = getRealMimeType(mimeType); try { // determining the proper parser class name for the mimeType String parserClassName = null; + ParserInfo parserInfo = null; synchronized (plasmaParser.enabledParserList) { - if (plasmaParser.enabledParserList.containsKey(mimeType)) { - parserClassName = (String)plasmaParser.enabledParserList.get(mimeType); + if (plasmaParser.enabledParserList.contains(mimeType)) { + parserInfo = (ParserInfo)plasmaParser.availableParserList.get(mimeType); + parserClassName = parserInfo.parserClassName; } else { return null; } @@ -668,6 +703,7 @@ public final class plasmaParser { // checking if the created parser really supports the given mimetype Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes(); if ((supportedMimeTypes != null) && (supportedMimeTypes.containsKey(mimeType))) { + parserInfo.incUsageCounter(); return theParser; } theParserPool.returnObject(parserClassName,theParser); @@ -740,10 +776,40 @@ public final class plasmaParser { //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out try { - File in = new File(args[0]); - //File out = new File(args[1]); + File contentFile = null; + URL contentURL = null; + String contentMimeType = "application/octet-stream"; + + if (args.length < 2) { + System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]"); + } + + String mode = args[0]; + if (mode.equalsIgnoreCase("-f")) { + contentFile = new File(args[1]); + contentURL = contentFile.toURL(); + } else if (mode.equalsIgnoreCase("-u")) { + contentURL = new URL(args[1]); + + // downloading the document content + byte[] contentBytes = httpc.singleGET(contentURL, 10000, null, null, null); + + contentFile = File.createTempFile("content",".tmp"); + contentFile.deleteOnExit(); + serverFileUtils.write(contentBytes, contentFile); + } + + if ((args.length == 4)&&(args[2].equalsIgnoreCase("-m"))) { + contentMimeType = args[3]; + } + + // creating a plasma parser plasmaParser theParser = new plasmaParser(); + + // configuring the realtime parsable mimeTypes plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain"); + + // configure all other supported mimeTypes plasmaParser.initParseableMimeTypes( "application/atom+xml," + "application/gzip," + @@ -763,14 +829,14 @@ public final class plasmaParser { "text/xml," + "application/x-bzip2," + "application/postscript," + - "text/x-vcard"); - FileInputStream theInput = new FileInputStream(in); - ByteArrayOutputStream theOutput = new ByteArrayOutputStream(); - serverFileUtils.copy(theInput, theOutput); - plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.ps"), null, theOutput.toByteArray()); - //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); - //byte[] theText = document.getText(); - //serverFileUtils.write(theText, out); + "text/x-vcard," + + "application/vnd.oasis.opendocument.text," + + "application/x-vnd.oasis.opendocument.text"); + + // parsing the content + plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile); + + // printing out all parsed sentences if (document != null) { String[] sentences = document.getSentences(); if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index c7bc9ef3f..da3404b27 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -327,8 +327,9 @@ public class plasmaSnippetCache { if (header == null) { String filename = this.cacheManager.getCachePath(url).getName(); int p = filename.lastIndexOf('.'); - if ( + if ( // if no extension is available (p < 0) || + // or the extension is supported by one of the parsers ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1)))) ) { String supposedMime = "text/html"; diff --git a/source/de/anomic/server/logging/serverLog.java b/source/de/anomic/server/logging/serverLog.java index 5f3338f79..e34443455 100644 --- a/source/de/anomic/server/logging/serverLog.java +++ b/source/de/anomic/server/logging/serverLog.java @@ -81,69 +81,98 @@ public final class serverLog { public void setLevel(Level newLevel) { this.theLogger.setLevel(newLevel); } - + public void logSevere(String message) {this.theLogger.severe(message);} public void logSevere(String message, Throwable thrown) {this.theLogger.log(Level.SEVERE,message,thrown);} + public boolean isSevere() { return this.theLogger.isLoggable(Level.SEVERE); } public void logWarning(String message) {this.theLogger.warning(message);} public void logWarning(String message, Throwable thrown) {this.theLogger.log(Level.WARNING,message,thrown);} - + public boolean isWarning() { return this.theLogger.isLoggable(Level.WARNING); } + public void logConfig(String message) {this.theLogger.config(message);} public void logConfig(String message, Throwable thrown) {this.theLogger.log(Level.CONFIG,message,thrown);} + public boolean isConfig() { return this.theLogger.isLoggable(Level.CONFIG); } public void logInfo(String message) {this.theLogger.info(message);} public void logInfo(String message, Throwable thrown) {this.theLogger.log(Level.INFO,message,thrown);} + public boolean isInfo() { return this.theLogger.isLoggable(Level.INFO); } public void logFine(String message) {this.theLogger.fine(message);} public void logFine(String message, Throwable thrown) {this.theLogger.log(Level.FINE,message,thrown);} + public boolean isFine() { return this.theLogger.isLoggable(Level.FINE); } public void logFiner(String message) {this.theLogger.finer(message);} - public void logFiner(String message, Throwable thrown) {this.theLogger.log(Level.FINER,message,thrown);} + public void logFiner(String message, Throwable thrown) {this.theLogger.log(Level.FINER,message,thrown);} + public boolean isFiner() { return this.theLogger.isLoggable(Level.FINER); } public void logFinest(String message) {this.theLogger.finest(message);} - public void logFinest(String message, Throwable thrown) {this.theLogger.log(Level.FINEST,message,thrown);} + public void logFinest(String message, Throwable thrown) {this.theLogger.log(Level.FINEST,message,thrown);} + public boolean isFinest() { return this.theLogger.isLoggable(Level.FINEST); } + + private void log(Level level, String msg, Throwable thrown) { + this.theLogger.log(level, msg, thrown); + } public boolean isLoggable(Level level) { return this.theLogger.isLoggable(level); } + // static log messages: log everything private static void log(String appName, int messageLevel, String message) { Logger.getLogger(appName).log(Level.parse(Integer.toString(messageLevel)),message); } - private void log(Level level, String msg, Throwable thrown) { - this.theLogger.log(level, msg, thrown); - } + public static void logSevere(String appName, String message) { Logger.getLogger(appName).severe(message); } public static void logSevere(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.SEVERE,message,thrown); } + public static void isSevere(String appName) { + Logger.getLogger(appName).isLoggable(Level.SEVERE); + } + public static void logWarning(String appName, String message) { Logger.getLogger(appName).warning(message); } public static void logWarning(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.WARNING,message,thrown); } + public static void isWarning(String appName) { + Logger.getLogger(appName).isLoggable(Level.WARNING); + } + public static void logConfig(String appName, String message) { Logger.getLogger(appName).config(message); } public static void logConfig(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.CONFIG,message,thrown); } + public static void isConfig(String appName) { + Logger.getLogger(appName).isLoggable(Level.CONFIG); + } + public static void logInfo(String appName, String message) { Logger.getLogger(appName).info(message); } public static void logInfo(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.INFO,message,thrown); } + public static void isInfo(String appName) { + Logger.getLogger(appName).isLoggable(Level.INFO); + } + public static void logFine(String appName, String message) { Logger.getLogger(appName).fine(message); } public static void logFine(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.FINE,message,thrown); } + public static void isFine(String appName) { + Logger.getLogger(appName).isLoggable(Level.FINE); + } public static void logFiner(String appName, String message) { Logger.getLogger(appName).finer(message); @@ -151,6 +180,9 @@ public final class serverLog { public static void logFiner(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.FINER,message,thrown); } + public static void isFiner(String appName) { + Logger.getLogger(appName).isLoggable(Level.FINER); + } public static void logFinest(String appName, String message) { Logger.getLogger(appName).finest(message); @@ -158,6 +190,9 @@ public final class serverLog { public static void logFinest(String appName, String message, Throwable thrown) { Logger.getLogger(appName).log(Level.FINEST,message,thrown); } + public static void isFinest(String appName) { + Logger.getLogger(appName).isLoggable(Level.FINEST); + } public static final void configureLogging(File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException { FileInputStream fileIn = null;