- plasmaParserDocument can process subdocuments now (other archive-parsers may want to use this method)

- added 7zip parser
- added 'text/sgml' to realtime parseable mimetypes (sometimes returned by the mime type parser)
- added new cached output stream class, very suitable for parsers because of limited memory

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3740 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent b1680ab71f
commit 0a64047081

@ -1,42 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse JDT build path: source folders, JRE container, library jars and output folder. -->
<classpath>
    <!-- source folders; the sub-folders of htroot that are listed as separate source folders
         below (or are not compiled at all) are excluded from the htroot entry -->
    <classpathentry kind="src" path="source"/>
    <classpathentry excluding="htdocsdefault/|locale/|proxymsg/|yacy/|env/" kind="src" output="htroot" path="htroot"/>
    <classpathentry kind="src" path="htroot/htdocsdefault"/>
    <classpathentry kind="src" path="htroot/yacy"/>
    <classpathentry kind="src" path="htroot/env"/>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
    <!-- jars from lib/ -->
    <classpathentry kind="lib" path="lib/commons-collections.jar"/>
    <classpathentry kind="lib" path="lib/commons-pool.jar"/>
    <classpathentry kind="lib" path="lib/tar.jar"/>
    <!-- jars from libx/ -->
    <classpathentry kind="lib" path="libx/axis-ant.jar"/>
    <classpathentry kind="lib" path="libx/axis.jar"/>
    <classpathentry kind="lib" path="libx/bzip2.jar"/>
    <classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
    <classpathentry kind="lib" path="libx/commons-discovery.jar"/>
    <classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
    <classpathentry kind="lib" path="libx/commons-logging.jar"/>
    <classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
    <classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
    <classpathentry kind="lib" path="libx/jaxrpc.jar"/>
    <classpathentry kind="lib" path="libx/jdom.jar"/>
    <classpathentry kind="lib" path="libx/jmimemagic-0.1.0.jar"/>
    <classpathentry kind="lib" path="libx/jrpm-head.jar"/>
    <classpathentry kind="lib" path="libx/jrpm-SNAPSHOT.jar"/>
    <classpathentry kind="lib" path="libx/jsch-0.1.21.jar"/>
    <classpathentry kind="lib" path="libx/log4j-1.2.9.jar"/>
    <classpathentry kind="lib" path="libx/odf_utils_05_11_29.jar"/>
    <classpathentry kind="lib" path="libx/PDFBox-0.7.2.jar"/>
    <classpathentry kind="lib" path="libx/saaj.jar"/>
    <classpathentry kind="lib" path="libx/sbbi-upnplib-1.0.3.jar"/>
    <classpathentry kind="lib" path="libx/tm-extractors-0.4.jar"/>
    <classpathentry kind="lib" path="libx/wsdl4j.jar"/>
    <classpathentry kind="lib" path="libx/xerces.jar"/>
    <classpathentry kind="lib" path="libx/poi-3.0-alpha2-20060616.jar"/>
    <classpathentry kind="lib" path="libx/poi-scratchpad-3.0-alpha2-20060616.jar"/>
    <classpathentry kind="lib" path="libx/inetlib.jar"/>
    <classpathentry kind="lib" path="libx/gnumail.jar"/>
    <classpathentry kind="lib" path="libx/activation.jar"/>
    <classpathentry kind="lib" path="libx/webcat-0.1-swf.jar"/>
    <classpathentry kind="output" path="gen"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse JDT build path: source folders, JRE container, library jars and output folder. -->
<classpath>
    <!-- source folders; the sub-folders of htroot that are listed as separate source folders
         below (or are not compiled at all) are excluded from the htroot entry -->
    <classpathentry kind="src" path="source"/>
    <classpathentry excluding="htdocsdefault/|locale/|proxymsg/|yacy/|env/" kind="src" output="htroot" path="htroot"/>
    <classpathentry kind="src" path="htroot/htdocsdefault"/>
    <classpathentry kind="src" path="htroot/yacy"/>
    <classpathentry kind="src" path="htroot/env"/>
    <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
    <!-- jars from lib/ -->
    <classpathentry kind="lib" path="lib/commons-collections.jar"/>
    <classpathentry kind="lib" path="lib/commons-pool.jar"/>
    <classpathentry kind="lib" path="lib/tar.jar"/>
    <!-- jars from libx/ -->
    <classpathentry kind="lib" path="libx/axis-ant.jar"/>
    <classpathentry kind="lib" path="libx/axis.jar"/>
    <classpathentry kind="lib" path="libx/bzip2.jar"/>
    <classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
    <classpathentry kind="lib" path="libx/commons-discovery.jar"/>
    <classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
    <classpathentry kind="lib" path="libx/commons-logging.jar"/>
    <classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
    <classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
    <classpathentry kind="lib" path="libx/jaxrpc.jar"/>
    <classpathentry kind="lib" path="libx/jdom.jar"/>
    <classpathentry kind="lib" path="libx/jmimemagic-0.1.0.jar"/>
    <classpathentry kind="lib" path="libx/jrpm-head.jar"/>
    <classpathentry kind="lib" path="libx/jrpm-SNAPSHOT.jar"/>
    <classpathentry kind="lib" path="libx/jsch-0.1.21.jar"/>
    <classpathentry kind="lib" path="libx/log4j-1.2.9.jar"/>
    <classpathentry kind="lib" path="libx/odf_utils_05_11_29.jar"/>
    <classpathentry kind="lib" path="libx/PDFBox-0.7.2.jar"/>
    <classpathentry kind="lib" path="libx/saaj.jar"/>
    <classpathentry kind="lib" path="libx/sbbi-upnplib-1.0.3.jar"/>
    <classpathentry kind="lib" path="libx/tm-extractors-0.4.jar"/>
    <classpathentry kind="lib" path="libx/wsdl4j.jar"/>
    <classpathentry kind="lib" path="libx/xerces.jar"/>
    <classpathentry kind="lib" path="libx/poi-3.0-alpha2-20060616.jar"/>
    <classpathentry kind="lib" path="libx/poi-scratchpad-3.0-alpha2-20060616.jar"/>
    <classpathentry kind="lib" path="libx/inetlib.jar"/>
    <classpathentry kind="lib" path="libx/gnumail.jar"/>
    <classpathentry kind="lib" path="libx/activation.jar"/>
    <classpathentry kind="lib" path="libx/webcat-0.1-swf.jar"/>
    <classpathentry kind="lib" path="libx/J7Zip-modified.jar"/>
    <classpathentry kind="output" path="gen"/>
</classpath>

@ -1,17 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>trunk</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
<name>yacy</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>

@ -3,6 +3,7 @@
###
# Extension = MIME type
7z = application/x-7z-compressed
ai = application/postscript
aiff = audio/x-aiff
au = audio/basic

Binary file not shown.

@ -53,6 +53,7 @@ import java.io.IOException;
import java.io.InputStream;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.server.serverThread;
import de.anomic.server.logging.serverLog;
@ -138,6 +139,39 @@ public abstract class AbstractParser implements Parser{
return tempFile;
}
/**
 * Recursively parses all files below a directory and attaches every
 * successfully parsed file to <code>doc</code> as a sub-document.
 * Files which cause a {@link ParserException} are logged and skipped.
 *
 * @param location the URL of the enclosing document; base for the URLs built for the contained files
 * @param prefix path component placed in front of each contained file's path when building its URL
 * @param dir the directory to traverse
 * @param doc the document which receives the parsed sub-documents
 * @return the number of files which were parsed and added to <code>doc</code>
 * @throws ParserException if <code>dir</code> is not a directory
 * @throws InterruptedException if raised by the interruption check done before each entry
 * @throws IOException if the content of <code>dir</code> cannot be listed, or if propagated
 *         from the calls made for a contained file
 */
public int parseDir(URL location, String prefix, File dir, plasmaParserDocument doc)
        throws ParserException, InterruptedException, IOException {
    if (!dir.isDirectory())
        throw new ParserException("tried to parse ordinary file " + dir + " as directory", location);

    // File.list() returns null (not an empty array) if the directory cannot be read
    String[] files = dir.list();
    if (files == null)
        throw new IOException("unable to list content of directory " + dir);

    int result = 0;
    for (int i = 0; i < files.length; i++) {
        checkInterruption();
        File file = new File(dir, files[i]);
        this.theLogger.logFine("parsing file " + location + "#" + file + " in archive...");
        if (file.isDirectory()) {
            result += parseDir(location, prefix, file, doc);
        } else try {
            URL url = new URL(location, "/" + prefix + "/"
                    // XXX: workaround for relative paths within document
                    + file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
                    + "/" + file.getName());
            // the extension is the part behind the LAST dot ("a.tar.gz" -> "gz"), without a dot
            // the whole name is used (lastIndexOf returns -1)
            String ext = files[i].substring(files[i].lastIndexOf('.') + 1);
            plasmaParserDocument subdoc = new plasmaParser().parseSource(
                    url,
                    plasmaParser.getMimeTypeByFileExt(ext),
                    null, file);
            // TODO: change anchors back to use '#' after archive name
            doc.addSubDocument(subdoc);
            subdoc.close();
            result++;
        } catch (ParserException e) {
            // a single unparsable file must not abort processing of the remaining ones
            this.theLogger.logInfo("unable to parse file " + file + " in " + location + ", skipping");
        }
    }
    return result;
}
/**
* Parsing a document available as byte array.
* @param location the origin of the document

@ -0,0 +1,86 @@
// ByteArrayIInStream.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.parser.sevenzip;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import SevenZip.IInStream;
/**
 * An {@link IInStream} which reads from a byte array held in memory and
 * supports seeking relative to the start of the data or to the current
 * position.
 */
public class ByteArrayIInStream extends IInStream {

    /**
     * A {@link ByteArrayInputStream} which exposes its read position and
     * allows moving it. Declared static because it needs no access to the
     * enclosing instance.
     */
    private static class SeekableByteArrayInputStream extends ByteArrayInputStream {
        public SeekableByteArrayInputStream(byte[] buf) { super(buf); }
        public SeekableByteArrayInputStream(byte[] buf, int off, int len) { super(buf, off, len); }

        /** @return the index of the next byte to be read */
        public int getPosition() { return super.pos; }

        /** Moves the read position by <code>offset</code> bytes (may be negative). */
        public void seekRelative(long offset) { seekAbsolute(super.pos + offset); }

        /**
         * Sets the read position.
         * @param offset the new position, must be within <code>0..count</code>
         * @throws IndexOutOfBoundsException if <code>offset</code> is outside that range
         */
        public void seekAbsolute(long offset) {
            // validate as long before narrowing, otherwise huge offsets would wrap around
            // silently and negative ones would only fail later on the next read
            if (offset < 0 || offset > super.count)
                throw new IndexOutOfBoundsException(Long.toString(offset));
            super.pos = (int) offset;
        }
    }

    private final SeekableByteArrayInputStream sbais;

    /**
     * @param buffer the data to read from
     */
    public ByteArrayIInStream(byte[] buffer) {
        this.sbais = new SeekableByteArrayInputStream(buffer);
    }

    /**
     * Moves the read position.
     *
     * @param offset the offset to seek to, interpreted according to <code>origin</code>
     * @param origin <code>STREAM_SEEK_SET</code> for an absolute position or
     *        <code>STREAM_SEEK_CUR</code> for a position relative to the current one;
     *        any other value leaves the position untouched
     * @return the read position after the operation
     * @throws IndexOutOfBoundsException if the resulting position lies outside of the data
     */
    public long Seek(long offset, int origin) {
        switch (origin) {
            case STREAM_SEEK_SET: this.sbais.seekAbsolute(offset); break;
            case STREAM_SEEK_CUR: this.sbais.seekRelative(offset); break;
        }
        return this.sbais.getPosition();
    }

    public int read() throws IOException {
        return this.sbais.read();
    }

    public int read(byte[] b, int off, int len) throws IOException {
        return this.sbais.read(b, off, len);
    }
}

@ -0,0 +1,176 @@
// SZParserExtractCallback.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.parser.sevenzip;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.logging.serverLog;
import SevenZip.ArchiveExtractCallback;
import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content
public class SZParserExtractCallback extends ArchiveExtractCallback {
private final serverLog log;
private final long maxRamSize;
private serverCachedFileOutputStream cfos = null;
private final plasmaParser parser;
private final plasmaParserDocument doc;
private final String prefix;
public SZParserExtractCallback(serverLog logger, IInArchive handler,
long maxRamSize, plasmaParserDocument doc, String prefix) {
super.Init(handler);
this.log = logger;
this.maxRamSize = maxRamSize;
this.parser = new plasmaParser();
this.doc = doc;
this.prefix = prefix;
}
public void PrepareOperation(int arg0) {
this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
switch (arg0) {
case IInArchive.NExtract_NAskMode_kExtract:
this.log.logFine("Extracting " + this.filePath);
break;
case IInArchive.NExtract_NAskMode_kTest:
this.log.logFine("Testing " + this.filePath);
break;
case IInArchive.NExtract_NAskMode_kSkip:
this.log.logFine("Skipping " + this.filePath);
break;
};
}
public void SetOperationResult(int arg0) throws IOException {
if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
this.NumErrors++;
switch(arg0) {
case IInArchive.NExtract_NOperationResult_kUnSupportedMethod:
throw new IOException("Unsupported Method");
case IInArchive.NExtract_NOperationResult_kCRCError:
throw new IOException("CRC Failed");
case IInArchive.NExtract_NOperationResult_kDataError:
throw new IOException("Data Error");
default:
// throw new IOException("Unknown Error");
}
} else try {
AbstractParser.checkInterruption();
if (this.cfos != null) {
// parse the file
plasmaParserDocument theDoc;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
URL url = new URL(doc.getLocation(), this.prefix + "/" + super.filePath);
String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) {
theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentFile());
} else {
theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentBAOS());
}
// revert the above workaround
Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f);
Iterator it = theDoc.getAnchors().entrySet().iterator();
Map.Entry entry;
String base = doc.getLocation().toNormalform();
while (it.hasNext()) {
entry = (Map.Entry)it.next();
if (((String)entry.getKey()).startsWith(base + "/")) {
String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1);
this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref);
nanchors.put(base + ref, entry.getValue());
} else {
nanchors.put(entry.getKey(), entry.getValue());
}
}
theDoc.getAnchors().clear();
theDoc.getAnchors().putAll(nanchors);
this.doc.addSubDocument(theDoc);
}
} catch (ParserException e) {
IOException ex = new IOException("error parsing extracted content of " + super.filePath + ": " + e.getMessage());
ex.initCause(e);
throw ex;
} catch (InterruptedException e) {
IOException ex = new IOException("interrupted");
ex.initCause(e);
throw ex;
}
}
public OutputStream GetStream(int index, int askExtractMode) throws IOException {
SevenZipEntry item = super.archiveHandler.getEntry(index);
super.filePath = item.getName();
try {
AbstractParser.checkInterruption();
} catch (InterruptedException e) {
IOException ex = new IOException("interrupted");
ex.initCause(e);
throw ex;
}
this.cfos = (item.isDirectory()) ? null
: new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
return this.cfos;
}
public String getCurrentFilePath() {
return super.filePath;
}
}

@ -0,0 +1,58 @@
<?xml version="1.0"?>
<project name="YACY - sevenzipParser" default="dist">
    <description>
        Some classes to parse 7zip files
    </description>

    <!-- names derived from the short name of the parser; ${release}, ${src}, ${build}, ${libx},
         ${javacSource}, ${javacTarget} and ${releaseFileParentDir} are not defined in this file
         and have to be provided from outside -->
    <property name="parserShortName" value="sevenzip"/>
    <property name="parserVersion" value="0.1"/>
    <property name="parserLongName" value="yacyContentParser_${parserShortName}"/>
    <property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>

    <!-- compiles the sources of the parser against the build directory and the 7zip library -->
    <target name="compile">
        <javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}"
               destdir="${build}" source="${javacSource}" target="${javacTarget}"
               debug="true" debuglevel="lines,vars,source">
            <classpath>
                <pathelement location="${build}" />
                <pathelement location="${libx}/J7Zip-modified.jar" />
            </classpath>
        </javac>
    </target>

    <!-- packs library, sources and classes of the parser into a gzip-compressed tar archive -->
    <target name="zip" depends="compile">
        <tar destfile="${parserArchive}" compression="gzip">
            <tarfileset dir="${libx}"
                        includes="J7Zip-modified.jar"
                        prefix="${releaseFileParentDir}/libx/"
                        dirmode="755" mode="644"/>
            <tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
                        prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
                        dirmode="755" mode="644"/>
            <tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}"
                        prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
                        dirmode="755" mode="644"/>
        </tar>
    </target>

    <!-- copies library, sources and classes of the parser into the release directory -->
    <target name="copy" depends="compile">
        <copy todir="${release}/libx/">
            <fileset dir="${libx}" includes="J7Zip-modified.jar"/>
        </copy>
        <copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
            <fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
        </copy>
        <copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
            <fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
        </copy>
    </target>

    <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>
</project>

@ -0,0 +1,146 @@
// sevenzipParser.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.parser.sevenzip;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Hashtable;
import SevenZip.IInStream;
import SevenZip.MyRandomAccessFile;
import SevenZip.Archive.SevenZip.Handler;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.serverFileUtils;
/**
 * Parser for 7zip archives. Every entry of the archive is extracted and
 * parsed by {@link SZParserExtractCallback}, the results are collected as
 * sub-documents of the returned document.
 */
public class sevenzipParser extends AbstractParser implements Parser {

    /**
     * a list of mime types that are supported by this parser class
     * @see #getSupportedMimeTypes()
     */
    public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
    static {
        SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
    }

    /**
     * a list of library names that are needed by this parser
     * @see Parser#getLibxDependences()
     */
    private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" };

    public sevenzipParser() throws IOException {
        super(LIBX_DEPENDENCIES);
        super.parserName = "7zip Archive Parser";
    }

    /**
     * Parses a 7zip archive which is available as seekable stream.
     *
     * @param location the origin of the archive
     * @param mimeType the mime type of the archive
     * @param charset the charset, handed on to the created document
     * @param source the content of the archive
     * @param maxRamSize handed to the extract callback, which passes it to the stream
     *        created for every extracted entry
     * @return a document which contains the parsed entries as sub-documents
     * @throws ParserException if the archive cannot be opened or processed
     * @throws InterruptedException if the interruption check fails
     */
    public plasmaParserDocument parse(URL location, String mimeType, String charset,
            IInStream source, long maxRamSize) throws ParserException, InterruptedException {
        plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
        final Handler archive;
        super.theLogger.logFine("opening 7zip archive...");
        try {
            archive = new Handler(source);
        } catch (IOException e) {
            // no handler exists which could be closed later, so release the stream here
            try {
                source.close();
            } catch (IOException ce) {
                super.theLogger.logFine("unable to close source of 7zip archive " + location + ": " + ce.getMessage());
            }
            throw new ParserException("error opening 7zip archive", location, e);
        }

        // from here on the archive is open: everything which may fail - including the
        // interruption check - has to run inside the try block so that it gets closed
        try {
            checkInterruption();
            final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
                    maxRamSize, doc, location.getFile());
            super.theLogger.logFine("processing archive contents...");
            try {
                archive.Extract(null, -1, 0, aec);
                return doc;
            } catch (IOException e) {
                // the callback wraps these two exception types into IOExceptions, unwrap them again
                if (e.getCause() instanceof InterruptedException)
                    throw (InterruptedException)e.getCause();
                if (e.getCause() instanceof ParserException)
                    throw (ParserException)e.getCause();
                throw new ParserException(
                        "error processing 7zip archive at internal file: " + aec.getCurrentFilePath(),
                        location, e);
            }
        } finally {
            try {
                archive.close();
            } catch (IOException e) {
                // closing is best effort, but leave a trace of the failure
                super.theLogger.logFine("unable to close 7zip archive " + location + ": " + e.getMessage());
            }
        }
    }

    /**
     * Parses a 7zip archive which is available as byte array.
     */
    public plasmaParserDocument parse(URL location, String mimeType, String charset,
            byte[] source) throws ParserException, InterruptedException {
        // the archive itself already occupies source.length bytes of the memory budget
        // NOTE(review): the remaining budget gets negative for sources larger than
        // MAX_KEEP_IN_MEMORY_SIZE - confirm that serverCachedFileOutputStream copes with that
        return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length);
    }

    /**
     * Parses a 7zip archive which is available as file.
     */
    public plasmaParserDocument parse(URL location, String mimeType, String charset,
            File sourceFile) throws ParserException, InterruptedException {
        try {
            return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE);
        } catch (IOException e) {
            throw new ParserException("error processing 7zip archive", location, e);
        }
    }

    /**
     * Parses a 7zip archive which is available as plain stream. The content of
     * the stream is copied into a cached stream first and then parsed either
     * from the file or from the byte array the cached stream provides.
     */
    public plasmaParserDocument parse(URL location, String mimeType, String charset,
            InputStream source) throws ParserException, InterruptedException {
        try {
            serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
            serverFileUtils.copy(source, cfos);
            if (cfos.isFallback()) {
                return parse(location, mimeType, charset, cfos.getContentFile());
            } else {
                return parse(location, mimeType, charset, cfos.getContentBAOS());
            }
        } catch (IOException e) {
            throw new ParserException("error processing 7zip archive", location, e);
        }
    }

    public Hashtable getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }
}

@ -941,7 +941,7 @@ public final class plasmaParser {
plasmaParser theParser = new plasmaParser();
// configuring the realtime parsable mimeTypes
plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");
// configure all other supported mimeTypes
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);

@ -46,28 +46,35 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.serverFileUtils;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.net.URL;
import de.anomic.plasma.parser.Parser;
public class plasmaParserDocument {
private URL location; // the source url
private String mimeType; // mimeType as taken from http header
private String charset; // the charset of the document
private String[] keywords; // most resources provide a keyword field
private String title; // a document title, taken from title or h1 tag; shall appear as headline of search result
private String author; // author or copyright
private String[] sections; // if present: more titles/headlines appearing in the document
private String abstrct; // an abstract, if present: short content description
private List keywords; // most resources provide a keyword field
private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result
private StringBuffer author; // author or copyright
private List sections; // if present: more titles/headlines appearing in the document
private StringBuffer abstrct; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private Map anchors; // all links embedded as clickeable entities (anchor tags)
private TreeSet images; // all visible pictures in document
@ -77,54 +84,63 @@ public class plasmaParserDocument {
private Map hyperlinks, audiolinks, videolinks, applinks;
private Map emaillinks;
private boolean resorted;
private InputStream textStream;
public plasmaParserDocument(URL location, String mimeType, String charset,
private InputStream textStream;
protected plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
byte[] text, Map anchors, TreeSet images) {
Object text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
this.keywords = (keywords==null) ? new String[0] : keywords;
this.title = (title==null)?"":title;
this.author = (author==null)?"":author;
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = (text==null)?new byte[0]:text;
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new TreeSet():images;
this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords);
this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections);
this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
this.anchors = (anchors == null) ? new HashMap(0) : anchors;
this.images = (images == null) ? new TreeSet() : images;
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
this.applinks = null;
this.emaillinks = null;
this.resorted = false;
if (text == null) try {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
} catch (IOException e) {
e.printStackTrace();
this.text = new StringBuffer();
} else {
this.text = text;
}
}
public plasmaParserDocument(URL location, String mimeType, String charset) {
this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
}
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
byte[] text, Map anchors, TreeSet images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
File text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.charset = charset;
this.keywords = (keywords==null) ? new String[0] : keywords;
this.title = (title==null)?"":title;
this.author = (author==null)?"":author;
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = text;
if (text != null) text.deleteOnExit();
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
this.applinks = null;
this.emaillinks = null;
this.resorted = false;
}
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
serverCachedFileOutputStream text, Map anchors, TreeSet images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public URL getLocation() {
return this.location;
@ -142,19 +158,23 @@ public class plasmaParserDocument {
}
public String getTitle() {
return title;
return title.toString();
}
public String[] getSectionTitles() {
if (sections != null) return sections; else return new String[]{getTitle()};
if (sections != null) {
return (String[])sections.toArray(new String[this.sections.size()]);
} else {
return new String[] { getTitle() };
}
}
public String getAbstract() {
if (abstrct != null) return abstrct; else return getTitle();
if (abstrct != null) return abstrct.toString(); else return getTitle();
}
public String getAuthor() {
if (author != null) return author; else return "";
if (author != null) return author.toString(); else return new String();
}
public InputStream getText() {
@ -165,6 +185,8 @@ public class plasmaParserDocument {
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[])this.text);
} else if (this.text instanceof serverCachedFileOutputStream) {
return ((serverCachedFileOutputStream)this.text).getContent();
}
return this.textStream;
} catch (Exception e) {
@ -177,8 +199,18 @@ public class plasmaParserDocument {
try {
if (this.text == null) return new byte[0];
if (this.text instanceof File) return serverFileUtils.read((File)this.text);
else if (this.text instanceof byte[]) return (byte[])this.text;
if (this.text instanceof File) {
return serverFileUtils.read((File)this.text);
} else if (this.text instanceof byte[]) {
return (byte[])this.text;
} else if (this.text instanceof serverCachedFileOutputStream) {
serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text;
if (ffbaos.isFallback()) {
return serverFileUtils.read(ffbaos.getContent());
} else {
return ffbaos.getContentBAOS();
}
}
} catch (Exception e) {
e.printStackTrace();
}
@ -189,6 +221,9 @@ public class plasmaParserDocument {
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
else if (this.text instanceof serverCachedFileOutputStream) {
return ((serverCachedFileOutputStream)this.text).getLength();
}
return -1;
}
@ -204,19 +239,23 @@ public class plasmaParserDocument {
// sort out doubles and empty words
TreeSet hs = new TreeSet();
String s;
for (int i = 0; i < this.keywords.length; i++) {
if (this.keywords[i] == null) continue;
s = this.keywords[i].trim();
for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue;
s = ((String)this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
if (hs.size() == 0) return "";
// generate a new list
StringBuffer sb = new StringBuffer(this.keywords.length * 6);
StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
Iterator i = hs.iterator();
while (i.hasNext()) sb.append((String) i.next()).append(separator);
return sb.substring(0, sb.length() - 1);
}
// Returns the keyword list of this document.
// The internal list itself is returned, no copy is made, so modifications
// done by the caller are visible to this document as well.
public List getKeywords() {
return this.keywords;
}
public Map getAnchors() {
// returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map
@ -335,6 +374,27 @@ public class plasmaParserDocument {
this.resorted = true;
}
/**
 * Merges the content of a sub-document (e.g. one entry of an archive) into
 * this document: section titles, title, keywords, abstract, text, anchors
 * and images of <code>doc</code> are appended to the ones of this document.
 * Title and abstract parts are separated by a newline.
 *
 * @param doc the sub-document whose content is added to this document
 * @throws IOException if the text of either document cannot be copied
 */
public void addSubDocument(plasmaParserDocument doc) throws IOException {
    this.sections.addAll(Arrays.asList(doc.getSectionTitles()));

    if (this.title.length() > 0) this.title.append('\n');
    this.title.append(doc.getTitle());

    this.keywords.addAll(doc.getKeywords());

    if (this.abstrct.length() > 0) this.abstrct.append('\n');
    this.abstrct.append(doc.getAbstract());

    if (!(this.text instanceof serverCachedFileOutputStream)) {
        // getText() reads from this.text, therefore the existing text has to
        // be fetched BEFORE this.text is replaced by the new, still empty
        // stream; otherwise the old text is lost and the content of the
        // fresh stream is copied into itself
        final InputStream previous = getText();
        final serverCachedFileOutputStream merged =
                new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
        // defensive: the part of getText() handling a missing text is not
        // visible here, it may return null in that case
        if (previous != null) serverFileUtils.copy(previous, merged);
        this.text = merged;
    }

    final InputStream subText = doc.getText();
    if (subText != null) serverFileUtils.copy(subText, (serverCachedFileOutputStream)this.text);

    anchors.putAll(doc.getAnchors());
    images.addAll(doc.getImages());
}
public void close() {
// try close the output stream
if (this.textStream != null) {

@ -0,0 +1,175 @@
// FileFallbackByteArrayOutputStream.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.server;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
public class serverCachedFileOutputStream extends ByteArrayOutputStream {
protected File fallbackFile;
protected long fallbackSize;
protected boolean buffered;
protected long size = 0;
protected boolean isFallback = false;
protected OutputStream fallback = null;
public serverCachedFileOutputStream(long fallbackSize) throws IOException {
this(fallbackSize, null, true, 32);
}
public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered)
throws IOException {
this(fallbackSize, fallback, buffered, 32);
}
public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
long size) throws IOException {
this.fallbackSize = fallbackSize;
this.fallbackFile = (fallback == null) ? File.createTempFile(
serverCachedFileOutputStream.class.getName(),
Long.toString(System.currentTimeMillis())) : fallback;
this.buffered = buffered;
checkFallback(size);
}
public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
byte[] data) throws IOException {
this(fallbackSize, fallback, buffered, 0);
super.buf = data;
super.count = data.length;
checkFallback(this.size = data.length);
}
protected boolean checkFallback(long size) {
if (size > this.fallbackSize) try {
fallback();
return true;
} catch (IOException e) {
throw new RuntimeException("error falling back to file", e);
} else {
return false;
}
}
public void fallback() throws IOException {
if (this.isFallback) return;
this.isFallback = true;
if (!this.fallbackFile.exists()) {
this.fallbackFile.createNewFile();
} else if (this.fallbackFile.isDirectory()) {
throw new IOException("cannot write on a directory");
}
OutputStream os = new FileOutputStream(this.fallbackFile);
this.fallback = (this.buffered) ? new BufferedOutputStream(os) : os;
serverFileUtils.copy(new ByteArrayInputStream(super.buf), this.fallback);
super.buf = new byte[0];
super.count = 0;
super.reset();
}
public boolean isFallback() {
return this.isFallback;
}
public void write(int b) {
if (checkFallback(++this.size)) try {
this.fallback.write(b);
} catch (IOException e) {
throw new RuntimeException("error writing to fallback", e);
} else {
super.write(b);
}
}
public void write(byte[] b, int off, int len) {
if (checkFallback(this.size += len)) try {
this.fallback.write(b, off, len);
} catch (IOException e) {
throw new RuntimeException("error writing to fallback", e);
} else {
super.write(b, off, len);
}
}
public void close() throws IOException {
if (this.fallback != null)
this.fallback.close();
super.close();
}
public InputStream getContent() throws IOException {
close();
if (this.isFallback) {
InputStream is = new FileInputStream(this.fallbackFile);
return (this.buffered) ? new BufferedInputStream(is) : is;
} else {
return new ByteArrayInputStream(this.buf);
}
}
public byte[] getContentBAOS() {
if (this.isFallback)
throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
return super.buf;
}
public File getContentFile() {
if (!this.isFallback)
throw new RuntimeException("haven't fallen back yet, fallback file has no content");
return this.fallbackFile;
}
public long getLength() {
return this.size;
}
}

@ -204,7 +204,7 @@ proxyCacheMigration = true
#
# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
# parseableMime: specifies mime-types that can be indexed but not on the fly
parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain
parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain,text/sgml
parseableMimeTypes=
parseableMimeTypes.CRAWLER=
parseableMimeTypes.PROXY=

Loading…
Cancel
Save