diff --git a/htroot/Settings_Parser.inc b/htroot/Settings_Parser.inc
index 5b2a38a62..2953b68cd 100644
--- a/htroot/Settings_Parser.inc
+++ b/htroot/Settings_Parser.inc
@@ -7,23 +7,30 @@ For a detailed description of the various MIME-types take a look at
Activate
Mime-Type
-
Parser Class Name
+
Parser Usage
#{parser}#
-
-
#[mime]#
-
#[shortname]#
+
#[name]# V#[version]#
+
#[usage]#
+
+
+#{mime}#
+
+
+
#[mimetype]#
+
-
+
+#{/mime}#
#{/parser}#
-
+
Enable all parsers
-
+
Changes take effect immediately
diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java
index 1ed1b2643..af3735ee5 100644
--- a/htroot/Settings_p.java
+++ b/htroot/Settings_p.java
@@ -45,13 +45,16 @@
import java.util.Arrays;
import java.util.Collections;
+import java.util.Enumeration;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.parser.ParserInfo;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@@ -249,29 +252,42 @@ public final class Settings_p {
* Parser Configuration
*/
plasmaSwitchboard sb = (plasmaSwitchboard)env;
- Hashtable enabledParsers = sb.parser.getEnabledParserList();
- Hashtable availableParsers = sb.parser.getAvailableParserList();
+ HashSet enabledParsers = sb.parser.getEnabledParserList();
+ HashSet parserInfos = new HashSet(sb.parser.getAvailableParserList().values());
- // fetching a list of all available mimetypes
- List availableParserKeys = Arrays.asList(availableParsers.keySet().toArray(new String[availableParsers.size()]));
-
- // sort it
- Collections.sort(availableParserKeys);
+// // fetching a list of all available mimetypes
+// List availableParserKeys = Arrays.asList(availableParsers.entrySet().toArray(new ParserInfo[availableParsers.size()]));
+//
+// // sort it
+// Collections.sort(availableParserKeys);
// loop through the mimeTypes and add it to the properties
boolean allParsersEnabled = true;
int parserIdx = 0;
- Iterator availableParserIter = availableParserKeys.iterator();
+
+ Iterator availableParserIter = parserInfos.iterator();
while (availableParserIter.hasNext()) {
- String mimeType = (String) availableParserIter.next();
- String parserName = (String) availableParsers.get(mimeType);
- boolean parserIsEnabled = enabledParsers.containsKey(mimeType);
+ ParserInfo parserInfo = (ParserInfo) availableParserIter.next();
+ prop.put("parser_" + parserIdx + "_name", parserInfo.parserName);
+ prop.put("parser_" + parserIdx + "_version", parserInfo.parserVersionNr);
+ prop.put("parser_" + parserIdx + "_usage", Integer.toString(parserInfo.usageCount));
- prop.put("parser_" + parserIdx + "_mime", mimeType);
- prop.put("parser_" + parserIdx + "_name", parserName);
- prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1));
- prop.put("parser_" + parserIdx + "_status", parserIsEnabled ? 1:0);
- allParsersEnabled &= parserIsEnabled;
+ int mimeIdx = 0;
+ Enumeration mimeTypeIter = parserInfo.supportedMimeTypes.keys();
+ while (mimeTypeIter.hasMoreElements()) {
+ String mimeType = (String)mimeTypeIter.nextElement();
+
+ boolean parserIsEnabled = enabledParsers.contains(mimeType);
+
+ prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
+ //prop.put("parser_" + parserIdx + "_name", parserName);
+ //prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1));
+ prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", enabledParsers.contains(mimeType) ? 1:0);
+ allParsersEnabled &= parserIsEnabled;
+
+ mimeIdx++;
+ }
+ prop.put("parser_" + parserIdx + "_mime", mimeIdx);
parserIdx++;
}
diff --git a/httpd.mime b/httpd.mime
index 1327ce1d5..157655749 100644
--- a/httpd.mime
+++ b/httpd.mime
@@ -37,6 +37,7 @@ mov = video/quicktime
mpe = video/mpeg
mpeg = video/mpeg
mpg = video/mpeg
+odt = application/vnd.oasis.opendocument.text
ogg = audio/ogg-vorbis
pac = application/x-ns-proxy-autoconfig
pdf = application/pdf
@@ -70,6 +71,7 @@ tif = image/tiff
tiff = image/tiff
torrent = application/x-bittorrent
txt = text/plain
+vcf = text/x-vcard
wav = audio/x-wav
xhtml = application/xhtml+xml
xla = application/msexcel
diff --git a/libx/jmimemagic-0.0.4a.jar b/libx/jmimemagic-0.0.4a.jar
index b3cbfea22..66d420782 100644
Binary files a/libx/jmimemagic-0.0.4a.jar and b/libx/jmimemagic-0.0.4a.jar differ
diff --git a/libx/odf_utils_05_11_10.jar b/libx/odf_utils_05_11_10.jar
new file mode 100644
index 000000000..4e7a174f3
Binary files /dev/null and b/libx/odf_utils_05_11_10.jar differ
diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java
index 5860a3da1..3eeb80df4 100644
--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@@ -346,7 +346,7 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
int argc;
if (argsString == null) {
// no args here, maybe a POST with multipart extension
- int length;
+ int length = 0;
//System.out.println("HEADER: " + requestHeader.toString()); // DEBUG
if (method.equals(httpHeader.METHOD_POST)) {
@@ -356,10 +356,11 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
} else if (requestHeader.gzip()) {
length = -1;
gzipBody = new GZIPInputStream(body);
- } else {
- httpd.sendRespondError(conProp,out,4,403,null,"bad post values",null);
- return;
}
+// } else {
+// httpd.sendRespondError(conProp,out,4,403,null,"bad post values",null);
+// return;
+// }
// if its a POST, it can be either multipart or as args in the body
if ((requestHeader.containsKey(httpHeader.CONTENT_TYPE)) &&
@@ -438,7 +439,7 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
}
}else{
//you cannot share a .png/.gif file with a name like a class in htroot.
- if ( !(targetFile.exists()) && !((path.endsWith("png")||path.endsWith("gif"))&&targetClass!=null ) ){
+ if ( !(targetFile.exists()) && !((path.endsWith("png")||path.endsWith("gif")||path.endsWith(".stream"))&&targetClass!=null ) ){
targetFile = new File(htDocsPath, path);
targetClass = rewriteClassFile(new File(htDocsPath, path));
}
@@ -486,6 +487,20 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
Thread.currentThread().sleep(200); // see below
serverFileUtils.write(result, out);
}
+ } else if ((targetClass != null) && (path.endsWith(".stream"))) {
+ // call rewrite-class
+ requestHeader.put("CLIENTIP", conProp.getProperty("CLIENTIP"));
+ requestHeader.put("PATH", path);
+ requestHeader.put("INPUTSTREAM", body);
+ requestHeader.put("OUTPUTSTREAM", out);
+
+ httpd.sendRespondHeader(this.connectionProperties, out, httpVersion, 200, null);
+
+ // in case that there are no args given, args = null or empty hashmap
+ serverObjects tp = (serverObjects) rewriteMethod(targetClass).invoke(null, new Object[] {requestHeader, args, switchboard});
+
+ this.forceConnectionClose();
+ return;
} else if ((targetFile.exists()) && (targetFile.canRead())) {
// we have found a file that can be written to the client
// if this file uses templates, then we use the template
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index f7ed3ab35..e7defd382 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -73,6 +73,16 @@ public abstract class AbstractParser implements Parser{
* purposes.
*/
protected serverLog theLogger = null;
+
+ /**
+ * Version number of the parser
+ */
+ protected String parserVersionNr = "0.1";
+
+ /**
+ * Parser name
+ */
+ protected String parserName = this.getClass().getSimpleName();
/**
* The Constructor of this class.
@@ -165,4 +175,18 @@ public abstract class AbstractParser implements Parser{
this.theLogger = log;
}
+ /**
+ * Returns the version number of the parser
+ * @return parser version number
+ */
+ public String getVersion() {
+ return this.parserVersionNr;
+ }
+
+ /**
+ * Return the name of the parser
+ */
+ public String getName() {
+ return parserName;
+ }
}
diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
index 038b88505..dd3875d2a 100644
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@@ -122,4 +122,17 @@ public interface Parser {
*/
public void setLogger(serverLog log);
+ /**
+ * Returns the version number of the current parser
+ * @return parser version number
+ */
+ public String getVersion();
+
+ /**
+ * Returns the name of the parser
+ * @return parser name
+ */
+ public String getName();
}
+
+
diff --git a/source/de/anomic/plasma/parser/ParserInfo.java b/source/de/anomic/plasma/parser/ParserInfo.java
new file mode 100644
index 000000000..a7e68663a
--- /dev/null
+++ b/source/de/anomic/plasma/parser/ParserInfo.java
@@ -0,0 +1,34 @@
+package de.anomic.plasma.parser;
+
+import java.util.Hashtable;
+
+public class ParserInfo {
+ // Describes one available parser: its class, display name/version, properties and usage count.
+ public Class parserClass;
+ public String parserClassName;
+ // human readable name and version number; both are shown on the parser settings page
+ public String parserName;
+ public String parserVersionNr;
+
+ // parser properties; the keys of supportedMimeTypes are the mime type strings (presumably mapped to file extensions - verify against the parsers)
+ public String[] libxDependencies;
+ public Hashtable supportedMimeTypes;
+
+ // usage statistic; incremented via incUsageCounter()
+ public int usageCount = 0;
+ // Returns "<name> V<version> | <class name> | <supported mime types>"; a null version is printed as "0.0".
+ public String toString() {
+ StringBuffer toStr = new StringBuffer();
+
+ toStr.append(this.parserName).append(" V")
+ .append((this.parserVersionNr==null)?"0.0":this.parserVersionNr).append(" | ")
+ .append(this.parserClassName).append(" | ")
+ .append(this.supportedMimeTypes);
+
+ return toStr.toString();
+ }
+ // Increments the usage counter. NOTE(review): direct reads of the public usageCount field are not synchronized.
+ public synchronized void incUsageCounter() {
+ this.usageCount++;
+ }
+}
diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java
index e9e378b81..887932f5e 100644
--- a/source/de/anomic/plasma/parser/bzip/bzipParser.java
+++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@@ -77,9 +77,10 @@ public class bzipParser extends AbstractParser implements Parser {
private static final String[] LIBX_DEPENDENCIES = new String[] {
"bzip2.jar"
};
-
+
public bzipParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Bzip 2 UNIX Compressed File Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -105,7 +106,6 @@ public class bzipParser extends AbstractParser implements Parser {
int read = 0;
byte[] data = new byte[1024];
-
CBZip2InputStream zippedContent = new CBZip2InputStream(source);
tempFile = File.createTempFile("bunzip","tmp");
diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
index 0ed57ad5f..c339ad006 100644
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -75,6 +75,7 @@ implements Parser {
public docParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Word Document Parser";
}
public plasmaParserDocument parse(URL location, String mimeType,
diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java
index f33993f1f..c3028d561 100644
--- a/source/de/anomic/plasma/parser/gzip/gzipParser.java
+++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@@ -76,6 +76,7 @@ public class gzipParser extends AbstractParser implements Parser {
public gzipParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "GNU Zip Compressed Archive Parser";
}
public Hashtable getSupportedMimeTypes() {
diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
index 910ac7f3c..6f2f73454 100644
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -47,6 +47,7 @@ import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
+import java.util.HashMap;
import java.util.Hashtable;
import org.apache.log4j.Level;
@@ -74,7 +75,10 @@ implements Parser {
static {
SUPPORTED_MIME_TYPES.put("text/xml","xml");
SUPPORTED_MIME_TYPES.put("application/xml","xml");
- SUPPORTED_MIME_TYPES.put("application/octet-stream","");
+ SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
+ SUPPORTED_MIME_TYPES.put("application/octet-stream","");
+ SUPPORTED_MIME_TYPES.put("application/x-compress","");
+ SUPPORTED_MIME_TYPES.put("application/x-compressed","");
}
/**
@@ -88,14 +92,56 @@ implements Parser {
"xerces.jar"
};
+ /**
+ * Helping structure used to detect loops in the mimeType detection
+ * process
+ */
+ private static Hashtable threadLoopDetection = new Hashtable();
+
public mimeTypeParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "MimeType Parser";
+ }
+
+ public String getMimeType (File sourceFile) {
+ String mimeType = null;
+ // Detects the mime type from the content of sourceFile via jMimeMagic; returns null if nothing matched.
+ try {
+ Magic theMagic = new Magic();
+ MagicMatch match = theMagic.getMagicMatch(sourceFile);
+
+ // if a match was found we return the mime type of its first sub-match, or of the match itself if it has no sub-matches
+ if (match!=null) {
+ Collection subMatches = match.getSubMatches();
+ if ((subMatches != null) && (!subMatches.isEmpty())) {
+ mimeType = ((MagicMatch) subMatches.iterator().next()).getMimeType();
+ } else {
+ mimeType = match.getMimeType();
+ }
+ return mimeType;
+ }
+ } catch (Exception e) {
+ // detection is best effort: any failure is ignored here and reported as "unknown" (null) below
+ }
+ return null;
+ }
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {
+ String orgMimeType = mimeType;
+
// determining the mime type of the file ...
try {
+ // adding current thread to loop detection list
+ Integer loopDepth = null;
+ if (threadLoopDetection.containsKey(Thread.currentThread())) {
+ loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread());
+ } else {
+ loopDepth = new Integer(0);
+ }
+ if (loopDepth.intValue() > 5) return null;
+ threadLoopDetection.put(Thread.currentThread(),new Integer(loopDepth.intValue()+1));
+
// deactivating the logging for jMimeMagic
Logger theLogger = Logger.getLogger("net.sf.jmimemagic");
theLogger.setLevel(Level.OFF);
@@ -115,6 +161,7 @@ implements Parser {
// to avoid loops we have to test if the mimetype has changed ...
if (this.getSupportedMimeTypes().containsKey(mimeType)) return null;
+ if (orgMimeType.equals(mimeType)) return null;
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,mimeType,sourceFile);
@@ -123,6 +170,13 @@ implements Parser {
} catch (Exception e) {
return null;
+ } finally {
+ Integer loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread());
+ if (loopDepth.intValue() <= 1) {
+ threadLoopDetection.remove(Thread.currentThread());
+ } else {
+ threadLoopDetection.put(Thread.currentThread(), new Integer(loopDepth.intValue()-1));
+ }
}
}
diff --git a/source/de/anomic/plasma/parser/odt/build.xml b/source/de/anomic/plasma/parser/odt/build.xml
new file mode 100644
index 000000000..25ae3e5d5
--- /dev/null
+++ b/source/de/anomic/plasma/parser/odt/build.xml
@@ -0,0 +1,55 @@
+
+
+
+ A class to parse OpenDocument text (odt) files
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
new file mode 100644
index 000000000..3b9708472
--- /dev/null
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -0,0 +1,214 @@
+//zipParser.java
+//------------------------
+//part of YaCy
+//(C) by Michael Peter Christen; mc@anomic.de
+//first published on http://www.anomic.de
+//Frankfurt, Germany, 2005
+//
+//this file is contributed by Martin Thelian
+//last major change: 16.05.2005
+//
+//This program is free software; you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation; either version 2 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program; if not, write to the Free Software
+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+//Using this software in any meaning (reading, learning, copying, compiling,
+//running) means that you agree that the Author(s) is (are) not responsible
+//for cost, loss of data or any harm that may be caused directly or indirectly
+//by usage of this softare or this documentation. The usage of this software
+//is on your own risk. The installation and usage (starting/running) of this
+//software may allow other people or application to access your computer and
+//any attached devices and is highly dependent on the configuration of the
+//software which must be done by the user of the software; the author(s) is
+//(are) also not responsible for proper configuration and usage of the
+//software, even if provoked by documentation provided together with
+//the software.
+//
+//Any changes to this file according to the GPL as documented in the file
+//gpl.txt aside this file in the shipment you received can be done to the
+//lines that follows this copyright notice here, but changes must not be
+//done inside the copyright notive above. A re-distribution must contain
+//the intact and unchanged copyright notice.
+//Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma.parser.odt;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import com.catcode.odf.ODFMetaFileAnalyzer;
+import com.catcode.odf.OpenDocumentMetadata;
+import com.catcode.odf.OpenDocumentTextInputStream;
+
+import de.anomic.http.httpc;
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.AbstractParser;
+import de.anomic.plasma.parser.Parser;
+import de.anomic.plasma.parser.ParserException;
+import de.anomic.server.serverFileUtils;
+import de.anomic.server.logging.serverLog;
+
+public class odtParser extends AbstractParser implements Parser {
+
+ /**
+ * a list of mime types that are supported by this parser class
+ * @see #getSupportedMimeTypes()
+ */
+ public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+ static {
+ SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt");
+ SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt");
+ }
+
+ /**
+ * a list of library names that are needed by this parser
+ * @see Parser#getLibxDependences()
+ */
+ private static final String[] LIBX_DEPENDENCIES = new String[] {"odf_utils_05_11_10.jar"};
+
+ public odtParser() {
+ super(LIBX_DEPENDENCIES);
+ parserName = "OASIS OpenDocument V2 Text Document Parser";
+ }
+
+ public Hashtable getSupportedMimeTypes() {
+ return SUPPORTED_MIME_TYPES;
+ }
+
+    /**
+     * Parses an OpenDocument text file. The document text is read from the content.xml
+     * entry; title, subject, keywords and description are read from the meta.xml entry.
+     * @throws ParserException if the file cannot be opened or its content cannot be parsed
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException {
+        ZipFile zipFile = null;
+        try {
+            byte[] docContent = null;
+            String docDescription = null;
+            String docKeywords = null;
+            String docShortTitle = null;
+            String docLongTitle = null;
+
+            // opening the file as zip file
+            zipFile = new ZipFile(dest);
+            Enumeration zipEnum = zipFile.entries();
+
+            // looping through all contained files
+            while (zipEnum.hasMoreElements()) {
+                ZipEntry zipEntry = (ZipEntry) zipEnum.nextElement();
+                String entryName = zipEntry.getName();
+
+                if (entryName.equals("content.xml")) {
+                    // content.xml contains the document content in xml format
+                    InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
+                    OpenDocumentTextInputStream odStream = new OpenDocumentTextInputStream(zipFileEntryStream);
+                    docContent = serverFileUtils.read(odStream);
+                } else if (entryName.equals("meta.xml")) {
+                    // meta.xml contains metadata about the document
+                    InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
+                    ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer();
+                    OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream);
+                    docDescription = metaData.getDescription();
+                    docKeywords = metaData.getKeyword();
+                    docShortTitle = metaData.getTitle();
+                    docLongTitle = metaData.getSubject();
+                }
+            }
+
+            // If there is no title available we generate one. This happens after the loop because
+            // meta.xml may be listed before content.xml, so docContent may not be loaded any earlier.
+            if (docLongTitle == null) {
+                if (docShortTitle != null) {
+                    docLongTitle = docShortTitle;
+                } else if (docContent != null) {
+                    docLongTitle = new String(docContent, 0, Math.min(docContent.length, 80));
+                }
+                // Strings are immutable: the cleaned-up value has to be assigned back
+                if (docLongTitle != null) docLongTitle = docLongTitle.replaceAll("\r\n|[\r\n\t]", " ");
+            }
+
+            return new plasmaParserDocument(
+                    location,
+                    mimeType,
+                    docKeywords,
+                    docShortTitle,
+                    docLongTitle,
+                    null,
+                    docDescription,
+                    docContent,
+                    null,
+                    null);
+        } catch (Exception e) {
+            e.printStackTrace();
+            throw new ParserException("Unable to parse the odt content. " + e.getMessage());
+        } catch (Error e) {
+            throw new ParserException("Unable to parse the odt content. " + e.getMessage());
+        } finally {
+            if (zipFile != null) try { zipFile.close(); } catch (Exception ignored) { /* file handle is released best effort */ }
+        }
+    }
+
+ public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
+ File dest = null;
+ try {
+ // creating a tempfile
+ dest = File.createTempFile("OpenDocument", ".odt");
+ dest.deleteOnExit();
+
+ // copying the stream into a file
+ serverFileUtils.copy(source, dest);
+
+ // parsing the content
+ return parse(location, mimeType, dest);
+ } catch (Exception e) {
+ throw new ParserException("Unable to parse the odt document. " + e.getMessage());
+ } finally {
+ if (dest != null) try { dest.delete(); } catch (Exception e){}
+ }
+ }
+
+ public void reset() {
+ // Nothing to do here at the moment
+
+ }
+
+ public static void main(String[] args) {
+ try {
+ if (args.length != 1) return;
+
+ // getting the content URL
+ URL contentUrl = new URL(args[0]);
+
+ // creating a new parser
+ odtParser testParser = new odtParser();
+
+ // setting the parser logger
+ testParser.setLogger(new serverLog("PARSER.ODT"));
+
+ // downloading the document content
+ byte[] content = httpc.singleGET(contentUrl, 10000, null, null, null);
+ ByteArrayInputStream input = new ByteArrayInputStream(content);
+
+ // parsing the document
+ testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index 70db05b45..706d41413 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -74,10 +74,11 @@ public class pdfParser extends AbstractParser implements Parser {
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"PDFBox-0.7.2.jar"
- };
+ };
public pdfParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Acrobat Portable Document Parser";
}
public Hashtable getSupportedMimeTypes() {
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index dd2d8121c..1197cafa8 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -91,10 +91,11 @@ public class rssParser extends AbstractParser implements Parser {
"informa-0.6.0.jar",
"commons-logging.jar",
"jdom.jar"
- };
+ };
public rssParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Rich Site Summary/Atom Feed Parser";
}
public plasmaParserDocument parse(URL location, String mimeType,
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index 329605cce..667fc5285 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -73,10 +73,11 @@ implements Parser {
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
- private static final String[] LIBX_DEPENDENCIES = new String[] {};
+ private static final String[] LIBX_DEPENDENCIES = new String[] {};
public rtfParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Rich Text Format Parser";
}
public plasmaParserDocument parse(URL location, String mimeType,
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index 71dafb99f..10d512425 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -44,13 +44,16 @@
package de.anomic.plasma.parser.tar;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.InputStream;
+import java.io.PushbackInputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
+import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry;
import com.ice.tar.TarInputStream;
@@ -61,6 +64,7 @@ import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
+import de.anomic.server.serverFileUtils;
public class tarParser extends AbstractParser implements Parser {
@@ -71,6 +75,7 @@ public class tarParser extends AbstractParser implements Parser {
public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
static {
SUPPORTED_MIME_TYPES.put("application/x-tar","tar");
+ SUPPORTED_MIME_TYPES.put("application/tar","tar");
}
/**
@@ -83,6 +88,7 @@ public class tarParser extends AbstractParser implements Parser {
public tarParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Tape Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -92,6 +98,18 @@ public class tarParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
try {
+ // creating a new parser class to parse the unzipped content
+ plasmaParser theParser = new plasmaParser();
+
+ /*
+ * If the mimeType was not reported correctly by the webserver we
+ * have to decompress it first
+ */
+ String ext = plasmaParser.getFileExt(location).toLowerCase();
+ if (ext.equals("gz") || ext.equals("tgz")) {
+ source = new GZIPInputStream(source);
+ }
+
StringBuffer docKeywords = new StringBuffer();
StringBuffer docShortTitle = new StringBuffer();
StringBuffer docLongTitle = new StringBuffer();
@@ -100,11 +118,7 @@ public class tarParser extends AbstractParser implements Parser {
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
Map docImages = new HashMap();
-
-
- // creating a new parser class to parse the unzipped content
- plasmaParser theParser = new plasmaParser();
-
+
// looping through the contained files
TarEntry entry;
TarInputStream tin = new TarInputStream(source);
@@ -113,22 +127,34 @@ public class tarParser extends AbstractParser implements Parser {
if (entry.isDirectory()) continue;
// Get the entry name
- String entryName = entry.getName();
- int idx = entryName.lastIndexOf(".");
- String entryExt = (idx > -1) ? entryName.substring(idx+1) : null;
+ int idx = -1;
+ String entryName = entry.getName();
+ idx = entryName.lastIndexOf("/");
+ if (idx != -1) entryName = entryName.substring(idx+1);
+ idx = entryName.lastIndexOf(".");
+ String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// getting the entry content
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- byte[] buf = new byte[(int) entry.getSize()];
- int bytesRead = tin.read(buf);
- bos.write(buf);
- byte[] ut = bos.toByteArray();
-
- // parsing the content
- plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut);
+ plasmaParserDocument theDoc = null;
+ File tempFile = null;
+ try {
+
+
+ byte[] buf = new byte[(int) entry.getSize()];
+ int bytesRead = tin.read(buf);
+
+ tempFile = File.createTempFile("tarParser_" + ((idx>-1)?entryName.substring(0,idx):entryName), (entryExt.length()>0)?"."+entryExt:entryExt);
+ serverFileUtils.write(buf, tempFile);
+
+ // parsing the content
+
+ theDoc = theParser.parseSource(tempFile.toURL(),entryMime,tempFile);
+ } finally {
+ if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
+ }
if (theDoc == null) continue;
// merging all documents together
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index 0e1fe5b61..c46771f67 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -86,10 +86,11 @@ public class vcfParser extends AbstractParser implements Parser {
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
- private static final String[] LIBX_DEPENDENCIES = new String[] {"commons-codec-1.3.jar"};
+ private static final String[] LIBX_DEPENDENCIES = new String[] {"commons-codec-1.3.jar"};
public vcfParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "vCard Parser";
}
public Hashtable getSupportedMimeTypes() {
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index fa5cd2388..6372eec0d 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -79,10 +79,11 @@ public class zipParser extends AbstractParser implements Parser {
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
- private static final String[] LIBX_DEPENDENCIES = new String[] {};
+ private static final String[] LIBX_DEPENDENCIES = new String[] {};
public zipParser() {
super(LIBX_DEPENDENCIES);
+ parserName = "Compressed Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index e1551aaaf..41f8306ee 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -46,7 +46,6 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
@@ -70,8 +69,10 @@ import java.util.Set;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
+import de.anomic.http.httpc;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
+import de.anomic.plasma.parser.ParserInfo;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
@@ -92,7 +93,7 @@ public final class plasmaParser {
* @see #loadEnabledParserList()
* @see #setEnabledParserList(Enumeration)
*/
- private static final Properties enabledParserList = new Properties();
+ private static final HashSet enabledParserList = new HashSet();
/**
* A list of file extensions that are supported by all enabled parsers
@@ -104,12 +105,43 @@ public final class plasmaParser {
* be parsed in realtime.
*/
private static final HashSet supportedRealtimeFileExt = new HashSet();
+
+ /**
+ * A list of mimeTypes that are generic
+ */
+ private static final HashSet genericMimeTypes = new HashSet();
+ static {
+ genericMimeTypes.add("text/plain");
+ genericMimeTypes.add("text/text");
+ genericMimeTypes.add("text/xml");
+ genericMimeTypes.add("application/xml");
+ genericMimeTypes.add("application/x-xml");
+ genericMimeTypes.add("application/octet-stream");
+ genericMimeTypes.add("application/zip");
+ genericMimeTypes.add("application/x-zip");
+ genericMimeTypes.add("application/x-zip-compressed");
+ genericMimeTypes.add("application/x-compress");
+ genericMimeTypes.add("application/x-compressed");
+ }
/**
* A list of mimeTypes that can be parsed in Realtime (on the fly)
*/
private static final HashSet realtimeParsableMimeTypes = new HashSet();
+    private static final Properties mimeTypeLookupByFileExt = new Properties();
+    static { // fill the file-extension -> mimeType table once, when the class is initialised
+        BufferedInputStream mimeIn = null;
+        try {
+            mimeIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")));
+            mimeTypeLookupByFileExt.load(mimeIn);
+        } catch (IOException readError) {
+            System.err.println("ERROR: httpd.mime not found in settings path");
+        } finally {
+            if (mimeIn != null) try { mimeIn.close(); } catch (Exception ignored) { /* best-effort close */ }
+        }
+    }
+
/**
* A pool of parsers.
* @see plasmaParserPool
@@ -162,7 +194,7 @@ public final class plasmaParser {
// The maximum number of idle connections connections in the pool
// 0 = no limit.
- config.maxIdle = 10;
+ config.maxIdle = 5;
config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK;
config.minEvictableIdleTimeMillis = 30000;
@@ -175,6 +207,8 @@ public final class plasmaParser {
loadAvailableParserList();
}
+ private serverLog theLogger = new serverLog("PARSER");
+
/**
* This function is used to initialize the realtimeParsableMimeTypes List.
* This list contains a list of mimeTypes that can be parsed in realtime by
@@ -185,7 +219,7 @@ public final class plasmaParser {
public static void initRealtimeParsableMimeTypes(String realtimeParsableMimeTypes) {
LinkedList mimeTypes = new LinkedList();
if ((realtimeParsableMimeTypes == null) || (realtimeParsableMimeTypes.length() == 0)) {
-
+ // Nothing todo here
} else {
String[] realtimeParsableMimeTypeList = realtimeParsableMimeTypes.split(",");
for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim());
@@ -280,7 +314,7 @@ public final class plasmaParser {
}
synchronized (enabledParserList) {
- return enabledParserList.containsKey(mimeType);
+ return enabledParserList.contains(mimeType);
}
}
@@ -302,7 +336,7 @@ public final class plasmaParser {
// termining last position of . in file path
p = name.lastIndexOf('.');
- if (p < 0) return name; // seams to be strange, but this is a directory entry or default file (html)
+ if (p < 0) return "";
return name.substring(p + 1);
}
@@ -352,19 +386,8 @@ public final class plasmaParser {
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
}
- public static String getMimeTypeByFileExt(String fileExt) {
- // loading a list of extensions from file
- Properties prop = new Properties();
- BufferedInputStream bufferedIn = null;
- try {
- prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
- } catch (IOException e) {
- System.err.println("ERROR: httpd.mime not found in settings path");
- } finally {
- if (bufferedIn != null) try{bufferedIn.close();}catch(Exception e){}
- }
-
- return prop.getProperty(fileExt,"application/octet-stream");
+ public static String getMimeTypeByFileExt(String fileExt) { // looks fileExt up in the table loaded from httpd.mime at class initialisation
+ return mimeTypeLookupByFileExt.getProperty(fileExt,"application/octet-stream"); // extensions missing from the table map to application/octet-stream
}
public plasmaParser() {
@@ -373,7 +396,7 @@ public final class plasmaParser {
public static String[] setEnabledParserList(Set mimeTypeSet) {
- Properties newEnabledParsers = new Properties();
+ HashSet newEnabledParsers = new HashSet();
HashSet newSupportedFileExt = new HashSet();
if (mimeTypeSet != null) {
@@ -384,7 +407,7 @@ public final class plasmaParser {
Parser theParser = null;
try {
// getting the parser
- theParser = (Parser) plasmaParser.theParserPool.borrowObject(availableParserList.get(mimeType));
+ theParser = (Parser) plasmaParser.theParserPool.borrowObject(((ParserInfo)availableParserList.get(mimeType)).parserClassName);
// getting a list of mimeTypes that the parser supports
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
@@ -397,7 +420,7 @@ public final class plasmaParser {
newSupportedFileExt.addAll(Arrays.asList(extArray));
}
}
- newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
+ newEnabledParsers.add(mimeType);
} catch (Exception e) {
serverLog.logSevere("PARSER", "error in setEnabledParserList", e);
@@ -411,7 +434,7 @@ public final class plasmaParser {
synchronized (enabledParserList) {
enabledParserList.clear();
- enabledParserList.putAll(newEnabledParsers);
+ enabledParserList.addAll(newEnabledParsers);
}
@@ -420,34 +443,18 @@ public final class plasmaParser {
supportedFileExt.addAll(newSupportedFileExt);
}
- return (String[])newEnabledParsers.keySet().toArray(new String[newEnabledParsers.size()]);
+ return (String[])newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
}
- public Hashtable getEnabledParserList() {
+ public HashSet getEnabledParserList() { // returns a copy of the set of currently enabled mimeTypes
synchronized (plasmaParser.enabledParserList) {
- return (Hashtable) plasmaParser.enabledParserList.clone();
+ return (HashSet) plasmaParser.enabledParserList.clone(); // cloned while holding the set's lock, so the caller gets its own HashSet
}
}
public Hashtable getAvailableParserList() {
return plasmaParser.availableParserList;
- }
-
- private static void loadEnabledParserList() {
- // loading a list of availabe parser from file
- Properties prop = new Properties();
- BufferedInputStream bufferedIn = null;
- try {
- prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
- } catch (IOException e) {
- System.err.println("ERROR: yacy.parser not found in settings path");
- } finally {
- if (bufferedIn != null) try{ bufferedIn.close(); }catch(Exception e){}
- }
-
- // enable them ...
- setEnabledParserList(prop.keySet());
- }
+ }
private static void loadAvailableParserList() {
try {
@@ -474,9 +481,11 @@ public final class plasmaParser {
*/
File[] parserDirectories = parserDir.listFiles(parserDirectoryFilter);
if (parserDirectories == null) return;
+
for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
File currentDir = parserDirectories[parserDirNr];
serverLog.logFine("PARSER", "Searching in directory " + currentDir.toString());
+
String[] parserClasses = currentDir.list(parserFileNameFilter);
if (parserClasses == null) continue;
@@ -506,12 +515,25 @@ public final class plasmaParser {
// loading the list of mime-types that are supported by this parser class
Hashtable supportedMimeTypes = ((Parser)theParser).getSupportedMimeTypes();
+
+ // creating a parser info object
+ ParserInfo parserInfo = new ParserInfo();
+ parserInfo.parserClass = parserClass;
+ parserInfo.parserClassName = fullClassName;
+ parserInfo.libxDependencies = neededLibx;
+ parserInfo.supportedMimeTypes = supportedMimeTypes;
+ parserInfo.parserVersionNr = ((Parser)theParser).getVersion();
+ parserInfo.parserName = ((Parser)theParser).getName();
+
Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator();
while (mimeTypeIterator.hasNext()) {
String mimeType = (String) mimeTypeIterator.next();
- availableParserList.put(mimeType,fullClassName);
+ availableParserList.put(mimeType,parserInfo );
serverLog.logInfo("PARSER", "Found functional parser for mimeType '" + mimeType + "'." +
- ((neededLibxBuf.length()>0)?"\n Dependencies: " + neededLibxBuf.toString():""));
+ "\n\tName: " + parserInfo.parserName +
+ "\n\tVersion: " + parserInfo.parserVersionNr +
+ "\n\tClass: " + parserInfo.parserClassName +
+ ((neededLibxBuf.length()>0)?"\n\tDependencies: " + neededLibxBuf.toString():""));
}
} catch (Exception e) { /* we can ignore this for the moment */
@@ -537,50 +559,19 @@ public final class plasmaParser {
try {
theParserPool.close();
} catch (Exception e) { }
- }
+ }
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
-
- Parser theParser = null;
+ File tempFile = null; // scratch file holding a copy of 'source'; parsing itself is delegated to the File-based overload
try {
- mimeType = getRealMimeType(mimeType);
- String fileExt = getFileExt(location);
-
- // TODO: Handling of not trustable mimeTypes
- // text/plain, octet-stream
- if (
- (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) ||
- (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt"))
- ) {
- if (enabledParserList.containsKey("application/octet-stream")) {
- mimeType = "application/octet-stream";
- }
- }
-
- // getting the correct parser for the given mimeType
- theParser = this.getParser(mimeType);
-
- // if a parser was found we use it ...
- if (theParser != null) {
- return theParser.parse(location, mimeType,source);
- } else if (realtimeParsableMimeTypesContains(mimeType)) {
- // ... otherwise we make a html scraper and transformer
- htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
- OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
- hfos.write(source);
- hfos.close();
- return transformScraper(location, mimeType, scraper);
- } else {
- return null;
- }
- } catch (Exception e) {
- //e.printStackTrace();
+ tempFile = File.createTempFile("parseSource", ".tmp");
+ serverFileUtils.write(source, tempFile); // copy the content into the file first; otherwise an empty file would be parsed
+ return parseSource(location, mimeType, tempFile);
+ } catch (Exception e) { // temp-file creation, writing or parsing failed: return null (no document)
return null;
} finally {
- if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
- try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) {}
- }
+ if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){}
}
}
public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) {
@@ -590,16 +581,58 @@ public final class plasmaParser {
mimeType = getRealMimeType(mimeType);
String fileExt = getFileExt(location);
- // TODO: Handling of not trustable mimeTypes
- // text/plain, octet-stream
- if (
- (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) ||
- (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt"))
- ) {
- if (enabledParserList.containsKey("application/octet-stream")) {
- mimeType = "application/octet-stream";
- }
- }
+ if (this.theLogger.isFine())
+ this.theLogger.logFine("Parsing " + location + " with mimeType '" + mimeType +
+ "' and file extension '" + fileExt + "'.");
+
+ /*
+ * There are some problematic mimeType / file-extension combinations where we have to enforce
+ * a mimeType detection to get the proper parser for the content:
+ *
+ * - application/zip + .odt
+ * - text/plain + .odt
+ * - text/plain + .vcf
+ * - text/xml + .rss
+ * - text/xml + .atom
+ *
+ * In all these cases we can trust the fileExtension and have to determine the proper mimeType.
+ *
+ */
+
+// // Handling of not trustable mimeTypes
+// // - text/plain
+// // - text/xml
+// // - application/octet-stream
+// // - application/zip
+// if (
+// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) ||
+// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt"))
+// ) {
+// if (this.theLogger.isFine())
+// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType +
+// "' that seems not to be correct for file extension '" + fileExt + "'.");
+//
+// if (enabledParserList.containsKey("application/octet-stream")) {
+// theParser = this.getParser("application/octet-stream");
+// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile);
+// if (newMime == null)
+// if (newMime instanceof String) {
+// String newMimeType = (String)newMime;
+// if ((newMimeType.equals("application/octet-stream")) {
+// return null;
+// }
+// mimeType = newMimeType;
+// }
+// } else {
+// return null;
+// }
+// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){
+// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) {
+// mimeType = "application/vnd.oasis.opendocument.text";
+// } else {
+// return null;
+// }
+// }
// getting the correct parser for the given mimeType
theParser = this.getParser(mimeType);
@@ -647,16 +680,18 @@ public final class plasmaParser {
* @param mimeType
* @return
*/
- public Parser getParser(String mimeType) {
+ private Parser getParser(String mimeType) {
mimeType = getRealMimeType(mimeType);
try {
// determining the proper parser class name for the mimeType
String parserClassName = null;
+ ParserInfo parserInfo = null;
synchronized (plasmaParser.enabledParserList) {
- if (plasmaParser.enabledParserList.containsKey(mimeType)) {
- parserClassName = (String)plasmaParser.enabledParserList.get(mimeType);
+ if (plasmaParser.enabledParserList.contains(mimeType)) {
+ parserInfo = (ParserInfo)plasmaParser.availableParserList.get(mimeType);
+ parserClassName = parserInfo.parserClassName;
} else {
return null;
}
@@ -668,6 +703,7 @@ public final class plasmaParser {
// checking if the created parser really supports the given mimetype
Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes();
if ((supportedMimeTypes != null) && (supportedMimeTypes.containsKey(mimeType))) {
+ parserInfo.incUsageCounter();
return theParser;
}
theParserPool.returnObject(parserClassName,theParser);
@@ -740,10 +776,40 @@ public final class plasmaParser {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
- File in = new File(args[0]);
- //File out = new File(args[1]);
+ File contentFile = null;
+ URL contentURL = null;
+ String contentMimeType = "application/octet-stream";
+
+ if (args.length < 2) {
+ System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
+ }
+
+ String mode = args[0];
+ if (mode.equalsIgnoreCase("-f")) {
+ contentFile = new File(args[1]);
+ contentURL = contentFile.toURL();
+ } else if (mode.equalsIgnoreCase("-u")) {
+ contentURL = new URL(args[1]);
+
+ // downloading the document content
+ byte[] contentBytes = httpc.singleGET(contentURL, 10000, null, null, null);
+
+ contentFile = File.createTempFile("content",".tmp");
+ contentFile.deleteOnExit();
+ serverFileUtils.write(contentBytes, contentFile);
+ }
+
+ if ((args.length == 4)&&(args[2].equalsIgnoreCase("-m"))) {
+ contentMimeType = args[3];
+ }
+
+ // creating a plasma parser
plasmaParser theParser = new plasmaParser();
+
+ // configuring the realtime parsable mimeTypes
plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
+
+ // configure all other supported mimeTypes
plasmaParser.initParseableMimeTypes(
"application/atom+xml," +
"application/gzip," +
@@ -763,14 +829,14 @@ public final class plasmaParser {
"text/xml," +
"application/x-bzip2," +
"application/postscript," +
- "text/x-vcard");
- FileInputStream theInput = new FileInputStream(in);
- ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
- serverFileUtils.copy(theInput, theOutput);
- plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.ps"), null, theOutput.toByteArray());
- //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
- //byte[] theText = document.getText();
- //serverFileUtils.write(theText, out);
+ "text/x-vcard," +
+ "application/vnd.oasis.opendocument.text," +
+ "application/x-vnd.oasis.opendocument.text");
+
+ // parsing the content
+ plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
+
+ // printing out all parsed sentences
if (document != null) {
String[] sentences = document.getSentences();
if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index c7bc9ef3f..da3404b27 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -327,8 +327,9 @@ public class plasmaSnippetCache {
if (header == null) {
String filename = this.cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
- if (
+ if ( // if no extension is available
(p < 0) ||
+ // or the extension is supported by one of the parsers
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))
) {
String supposedMime = "text/html";
diff --git a/source/de/anomic/server/logging/serverLog.java b/source/de/anomic/server/logging/serverLog.java
index 5f3338f79..e34443455 100644
--- a/source/de/anomic/server/logging/serverLog.java
+++ b/source/de/anomic/server/logging/serverLog.java
@@ -81,69 +81,98 @@ public final class serverLog {
public void setLevel(Level newLevel) {
this.theLogger.setLevel(newLevel);
}
-
+
public void logSevere(String message) {this.theLogger.severe(message);}
public void logSevere(String message, Throwable thrown) {this.theLogger.log(Level.SEVERE,message,thrown);}
+ public boolean isSevere() { return this.theLogger.isLoggable(Level.SEVERE); }
public void logWarning(String message) {this.theLogger.warning(message);}
public void logWarning(String message, Throwable thrown) {this.theLogger.log(Level.WARNING,message,thrown);}
-
+ public boolean isWarning() { return this.theLogger.isLoggable(Level.WARNING); }
+
public void logConfig(String message) {this.theLogger.config(message);}
public void logConfig(String message, Throwable thrown) {this.theLogger.log(Level.CONFIG,message,thrown);}
+ public boolean isConfig() { return this.theLogger.isLoggable(Level.CONFIG); }
public void logInfo(String message) {this.theLogger.info(message);}
public void logInfo(String message, Throwable thrown) {this.theLogger.log(Level.INFO,message,thrown);}
+ public boolean isInfo() { return this.theLogger.isLoggable(Level.INFO); }
public void logFine(String message) {this.theLogger.fine(message);}
public void logFine(String message, Throwable thrown) {this.theLogger.log(Level.FINE,message,thrown);}
+ public boolean isFine() { return this.theLogger.isLoggable(Level.FINE); }
public void logFiner(String message) {this.theLogger.finer(message);}
- public void logFiner(String message, Throwable thrown) {this.theLogger.log(Level.FINER,message,thrown);}
+ public void logFiner(String message, Throwable thrown) {this.theLogger.log(Level.FINER,message,thrown);}
+ public boolean isFiner() { return this.theLogger.isLoggable(Level.FINER); }
public void logFinest(String message) {this.theLogger.finest(message);}
- public void logFinest(String message, Throwable thrown) {this.theLogger.log(Level.FINEST,message,thrown);}
+ public void logFinest(String message, Throwable thrown) {this.theLogger.log(Level.FINEST,message,thrown);}
+ public boolean isFinest() { return this.theLogger.isLoggable(Level.FINEST); }
+
+ private void log(Level level, String msg, Throwable thrown) {
+ this.theLogger.log(level, msg, thrown);
+ }
public boolean isLoggable(Level level) {
return this.theLogger.isLoggable(level);
}
+
// static log messages: log everything
private static void log(String appName, int messageLevel, String message) {
Logger.getLogger(appName).log(Level.parse(Integer.toString(messageLevel)),message);
}
- private void log(Level level, String msg, Throwable thrown) {
- this.theLogger.log(level, msg, thrown);
- }
+
public static void logSevere(String appName, String message) {
Logger.getLogger(appName).severe(message);
}
public static void logSevere(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.SEVERE,message,thrown);
}
+    public static boolean isSevere(String appName) { // true if the logger named appName would log SEVERE records
+        return Logger.getLogger(appName).isLoggable(Level.SEVERE);
+    }
+
public static void logWarning(String appName, String message) {
Logger.getLogger(appName).warning(message);
}
public static void logWarning(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.WARNING,message,thrown);
}
+    public static boolean isWarning(String appName) { // true if the logger named appName would log WARNING records
+        return Logger.getLogger(appName).isLoggable(Level.WARNING);
+    }
+
public static void logConfig(String appName, String message) {
Logger.getLogger(appName).config(message);
}
public static void logConfig(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.CONFIG,message,thrown);
}
+    public static boolean isConfig(String appName) { // true if the logger named appName would log CONFIG records
+        return Logger.getLogger(appName).isLoggable(Level.CONFIG);
+    }
+
public static void logInfo(String appName, String message) {
Logger.getLogger(appName).info(message);
}
public static void logInfo(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.INFO,message,thrown);
}
+    public static boolean isInfo(String appName) { // true if the logger named appName would log INFO records
+        return Logger.getLogger(appName).isLoggable(Level.INFO);
+    }
+
public static void logFine(String appName, String message) {
Logger.getLogger(appName).fine(message);
}
public static void logFine(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.FINE,message,thrown);
}
+    public static boolean isFine(String appName) { // true if the logger named appName would log FINE records
+        return Logger.getLogger(appName).isLoggable(Level.FINE);
+    }
public static void logFiner(String appName, String message) {
Logger.getLogger(appName).finer(message);
@@ -151,6 +180,9 @@ public final class serverLog {
public static void logFiner(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.FINER,message,thrown);
}
+    public static boolean isFiner(String appName) { // true if the logger named appName would log FINER records
+        return Logger.getLogger(appName).isLoggable(Level.FINER);
+    }
public static void logFinest(String appName, String message) {
Logger.getLogger(appName).finest(message);
@@ -158,6 +190,9 @@ public final class serverLog {
public static void logFinest(String appName, String message, Throwable thrown) {
Logger.getLogger(appName).log(Level.FINEST,message,thrown);
}
+    public static boolean isFinest(String appName) { // true if the logger named appName would log FINEST records
+        return Logger.getLogger(appName).isLoggable(Level.FINEST);
+    }
public static final void configureLogging(File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException {
FileInputStream fileIn = null;