diff --git a/.classpath b/.classpath
index dc7ec4add..3c7633e61 100644
--- a/.classpath
+++ b/.classpath
@@ -1,42 +1,43 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.project b/.project
index 3bbcaf65e..969b67fb5 100644
--- a/.project
+++ b/.project
@@ -1,17 +1,17 @@
-trunk
-
-
-
-
-
-org.eclipse.jdt.core.javabuilder
-
-
-
-
-
-org.eclipse.jdt.core.javanature
-
-
+ yacy
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+
+
diff --git a/httpd.mime b/httpd.mime
index 5b6ed7114..b86eb728d 100644
--- a/httpd.mime
+++ b/httpd.mime
@@ -3,6 +3,7 @@
###
# Extension = MIME type
+7z = application/x-7z-compressed
ai = application/postscript
aiff = audio/x-aiff
au = audio/basic
diff --git a/libx/J7Zip-modified.jar b/libx/J7Zip-modified.jar
new file mode 100644
index 000000000..176b3210e
Binary files /dev/null and b/libx/J7Zip-modified.jar differ
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index baa413a06..1d76b87c3 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -53,6 +53,7 @@ import java.io.IOException;
import java.io.InputStream;
import de.anomic.net.URL;
+import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.server.serverThread;
import de.anomic.server.logging.serverLog;
@@ -138,6 +139,39 @@ public abstract class AbstractParser implements Parser{
return tempFile;
}
+ public int parseDir(URL location, String prefix, File dir, plasmaParserDocument doc)
+ throws ParserException, InterruptedException, IOException {
+ if (!dir.isDirectory())
+ throw new ParserException("tried to parse ordinary file " + dir + " as directory", location);
+
+ String[] files = dir.list();
+ int result = 0;
+ for (int i=0; i super.count)
+ throw new IndexOutOfBoundsException(Integer.toString(offset));
+ super.pos = offset;
+ }
+ }
+
+ private final SeekableByteArrayInputStream sbais;
+
+ // Wraps the given byte array in a SeekableByteArrayInputStream; the Seek and read
+ // methods of this class delegate to that wrapped stream.
+ public ByteArrayIInStream(byte[] buffer) {
+ this.sbais = new SeekableByteArrayInputStream(buffer);
+ }
+
+    /**
+     * Moves the read position of the wrapped in-memory stream.
+     *
+     * @param offset target position (for STREAM_SEEK_SET) or distance to move
+     *               (for STREAM_SEEK_CUR); it is narrowed to an int before use
+     * @param origin STREAM_SEEK_SET or STREAM_SEEK_CUR; any other value leaves the
+     *               position untouched
+     * @return the position reported by the wrapped stream after the seek
+     */
+    public long Seek(long offset, int origin) {
+        final int narrowed = (int) offset;
+        if (origin == STREAM_SEEK_SET) {
+            this.sbais.seekAbsolute(narrowed);
+        } else if (origin == STREAM_SEEK_CUR) {
+            this.sbais.seekRelative(narrowed);
+        }
+        return this.sbais.getPosition();
+    }
+
+ // Reads a single byte by delegating to the wrapped seekable stream.
+ public int read() throws IOException {
+ return this.sbais.read();
+ }
+
+ // Bulk read into b[off..off+len); delegated unchanged to the wrapped seekable stream.
+ public int read(byte[] b, int off, int len) throws IOException {
+ return this.sbais.read(b, off, len);
+ }
+}
\ No newline at end of file
diff --git a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
new file mode 100644
index 000000000..6e1fc8569
--- /dev/null
+++ b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
@@ -0,0 +1,176 @@
+// SZParserExtractCallback.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+//
+// This file ist contributed by Franz Brausze
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma.parser.sevenzip;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaParser;
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.AbstractParser;
+import de.anomic.plasma.parser.ParserException;
+import de.anomic.server.serverCachedFileOutputStream;
+import de.anomic.server.logging.serverLog;
+
+import SevenZip.ArchiveExtractCallback;
+import SevenZip.Archive.IInArchive;
+import SevenZip.Archive.SevenZipEntry;
+
+/**
+ * Wrapper class that redirects the output of the standard ArchiveExtractCallback to serverLog
+ * and parses the extracted content.
+ * <p>
+ * Every archive entry is buffered in a {@link serverCachedFileOutputStream} handed out by
+ * {@link #GetStream(int, int)}. When {@link #SetOperationResult(int)} reports success, the
+ * buffered bytes are parsed and the result is added to the parent document as a sub-document.
+ */
+public class SZParserExtractCallback extends ArchiveExtractCallback {
+
+    private final serverLog log;
+    private final long maxRamSize;
+    /** buffer of the entry currently being extracted; null for directories and between entries */
+    private serverCachedFileOutputStream cfos = null;
+    private final plasmaParser parser;
+    private final plasmaParserDocument doc;
+    private final String prefix;
+
+    /**
+     * @param logger     log that receives the progress messages
+     * @param handler    archive handler, passed on to the superclass' Init
+     * @param maxRamSize threshold handed to each entry buffer as its fallback size
+     * @param doc        parent document that collects the parsed entries
+     * @param prefix     path prefix used when building the URL of an entry
+     */
+    public SZParserExtractCallback(serverLog logger, IInArchive handler,
+            long maxRamSize, plasmaParserDocument doc, String prefix) {
+        super.Init(handler);
+        this.log = logger;
+        this.maxRamSize = maxRamSize;
+        this.parser = new plasmaParser();
+        this.doc = doc;
+        this.prefix = prefix;
+    }
+
+    /**
+     * Records whether the upcoming operation is a real extraction and logs the operation.
+     *
+     * @param arg0 one of the IInArchive.NExtract_NAskMode_* constants
+     */
+    public void PrepareOperation(int arg0) {
+        this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
+        switch (arg0) {
+            case IInArchive.NExtract_NAskMode_kExtract:
+                this.log.logFine("Extracting " + this.filePath);
+                break;
+            case IInArchive.NExtract_NAskMode_kTest:
+                this.log.logFine("Testing " + this.filePath);
+                break;
+            case IInArchive.NExtract_NAskMode_kSkip:
+                this.log.logFine("Skipping " + this.filePath);
+                break;
+        }
+    }
+
+    /**
+     * Called after an entry has been processed. On success the buffered entry is parsed and
+     * merged into the parent document; on failure the error counter is increased and the known
+     * error codes are raised as IOException.
+     *
+     * @param arg0 one of the IInArchive.NExtract_NOperationResult_* constants
+     * @throws IOException on a known extraction error, or wrapping a ParserException /
+     *         InterruptedException (available via getCause()) raised while parsing the entry
+     */
+    public void SetOperationResult(int arg0) throws IOException {
+        // detach the buffer first so that a stale entry can never be parsed a second time
+        final serverCachedFileOutputStream content = this.cfos;
+        this.cfos = null;
+
+        if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
+            this.NumErrors++;
+            switch (arg0) {
+                case IInArchive.NExtract_NOperationResult_kUnSupportedMethod:
+                    throw new IOException("Unsupported Method");
+                case IInArchive.NExtract_NOperationResult_kCRCError:
+                    throw new IOException("CRC Failed");
+                case IInArchive.NExtract_NOperationResult_kDataError:
+                    throw new IOException("Data Error");
+                default:
+                    // other result codes are only counted, not raised
+                    // throw new IOException("Unknown Error");
+                    return;
+            }
+        }
+
+        try {
+            AbstractParser.checkInterruption();
+            if (content != null) {
+                this.doc.addSubDocument(parseEntry(content));
+            }
+        } catch (ParserException e) {
+            IOException ex = new IOException("error parsing extracted content of " + super.filePath + ": " + e.getMessage());
+            ex.initCause(e);
+            throw ex;
+        } catch (InterruptedException e) {
+            IOException ex = new IOException("interrupted");
+            ex.initCause(e);
+            throw ex;
+        }
+    }
+
+    /** Parses the buffered bytes of the current entry and normalises the anchors of the result. */
+    private plasmaParserDocument parseEntry(serverCachedFileOutputStream content)
+            throws IOException, ParserException, InterruptedException {
+        // make sure buffered data has reached the fallback file before it is read back
+        content.close();
+
+        // workaround for relative links in file, normally '#' shall be used behind the location,
+        // see revertAnchorWorkaround for the reversion of the effects
+        URL url = new URL(doc.getLocation(), this.prefix + "/" + super.filePath);
+        String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
+
+        plasmaParserDocument theDoc;
+        if (content.isFallback()) {
+            theDoc = this.parser.parseSource(url, mime, null, content.getContentFile());
+        } else {
+            // toByteArray() yields exactly the written bytes; the raw internal buffer may be
+            // larger than the content and would feed trailing padding to the parser
+            theDoc = this.parser.parseSource(url, mime, null, content.toByteArray());
+        }
+        revertAnchorWorkaround(theDoc);
+        return theDoc;
+    }
+
+    /** Rewrites anchors of the form base/path to base#path, reverting the URL workaround. */
+    private void revertAnchorWorkaround(plasmaParserDocument theDoc) {
+        Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f);
+        String base = doc.getLocation().toNormalform();
+        Iterator it = theDoc.getAnchors().entrySet().iterator();
+        while (it.hasNext()) {
+            Map.Entry entry = (Map.Entry) it.next();
+            String key = (String) entry.getKey();
+            if (key.startsWith(base + "/")) {
+                String ref = "#" + key.substring(base.length() + 1);
+                this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref);
+                nanchors.put(base + ref, entry.getValue());
+            } else {
+                nanchors.put(entry.getKey(), entry.getValue());
+            }
+        }
+        theDoc.getAnchors().clear();
+        theDoc.getAnchors().putAll(nanchors);
+    }
+
+    /**
+     * Provides the stream that receives the content of the entry with the given index.
+     *
+     * @param index          index of the archive entry
+     * @param askExtractMode not evaluated by this implementation
+     * @return a fresh buffer for files, null for directories
+     * @throws IOException wrapping an InterruptedException if the thread was interrupted
+     */
+    public OutputStream GetStream(int index, int askExtractMode) throws IOException {
+        SevenZipEntry item = super.archiveHandler.getEntry(index);
+        super.filePath = item.getName();
+        try {
+            AbstractParser.checkInterruption();
+        } catch (InterruptedException e) {
+            IOException ex = new IOException("interrupted");
+            ex.initCause(e);
+            throw ex;
+        }
+        this.cfos = (item.isDirectory()) ? null
+                : new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
+        return this.cfos;
+    }
+
+    /** @return the path of the entry that was most recently requested via GetStream */
+    public String getCurrentFilePath() {
+        return super.filePath;
+    }
+}
diff --git a/source/de/anomic/plasma/parser/sevenzip/build.xml b/source/de/anomic/plasma/parser/sevenzip/build.xml
new file mode 100644
index 000000000..6a7232ab7
--- /dev/null
+++ b/source/de/anomic/plasma/parser/sevenzip/build.xml
@@ -0,0 +1,58 @@
+
+
+
+ Some classes to parse 7zip files
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
new file mode 100644
index 000000000..ec9e0d40a
--- /dev/null
+++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
@@ -0,0 +1,146 @@
+// sevenzipParser.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+//
+// This file ist contributed by Franz Brausze
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma.parser.sevenzip;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Hashtable;
+
+import SevenZip.IInStream;
+import SevenZip.MyRandomAccessFile;
+import SevenZip.Archive.SevenZip.Handler;
+
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.AbstractParser;
+import de.anomic.plasma.parser.Parser;
+import de.anomic.plasma.parser.ParserException;
+import de.anomic.server.serverCachedFileOutputStream;
+import de.anomic.server.serverFileUtils;
+
+/**
+ * Parser for 7zip archives: every contained file is extracted and parsed, and the results are
+ * collected as sub-documents of a single plasmaParserDocument.
+ */
+public class sevenzipParser extends AbstractParser implements Parser {
+
+    /**
+     * a list of mime types that are supported by this parser class
+     * @see #getSupportedMimeTypes()
+     */
+    public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
+    static {
+        SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
+    }
+
+    /**
+     * a list of library names that are needed by this parser
+     * @see Parser#getLibxDependences()
+     */
+    private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" };
+
+    public sevenzipParser() throws IOException {
+        super(LIBX_DEPENDENCIES);
+        super.parserName = "7zip Archive Parser";
+    }
+
+    /**
+     * Opens the archive behind the given stream and parses all of its entries.
+     *
+     * @param location   URL of the archive
+     * @param mimeType   mime type stored in the resulting document
+     * @param charset    charset stored in the resulting document
+     * @param source     seekable stream providing the archive bytes
+     * @param maxRamSize threshold passed to the extract callback for buffering entries
+     * @return the document that collects the parsed entries
+     * @throws ParserException      if the archive cannot be opened or an entry cannot be processed
+     * @throws InterruptedException if the extraction was interrupted
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            IInStream source, long maxRamSize) throws ParserException, InterruptedException {
+        plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
+        Handler archive;
+        super.theLogger.logFine("opening 7zip archive...");
+        try {
+            archive = new Handler(source);
+        } catch (IOException e) {
+            throw new ParserException("error opening 7zip archive", location, e);
+        }
+        checkInterruption();
+        SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
+                maxRamSize, doc, location.getFile());
+        super.theLogger.logFine("processing archive contents...");
+        try {
+            archive.Extract(null, -1, 0, aec);
+            return doc;
+        } catch (IOException e) {
+            // the callback tunnels these two exception types through IOException
+            if (e.getCause() instanceof InterruptedException)
+                throw (InterruptedException)e.getCause();
+            if (e.getCause() instanceof ParserException)
+                throw (ParserException)e.getCause();
+            throw new ParserException(
+                    "error processing 7zip archive at internal file: " + aec.getCurrentFilePath(),
+                    location, e);
+        } finally {
+            try {
+                archive.close();
+            } catch (IOException e) {
+                // nothing left to recover here, but do not hide the failure completely
+                super.theLogger.logFine("error closing 7zip archive: " + e.getMessage());
+            }
+        }
+    }
+
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            byte[] source) throws ParserException, InterruptedException {
+        // the archive itself already occupies source.length bytes of the memory budget
+        return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length);
+    }
+
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            File sourceFile) throws ParserException, InterruptedException {
+        try {
+            return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE);
+        } catch (IOException e) {
+            throw new ParserException("error processing 7zip archive", location, e);
+        }
+    }
+
+    /**
+     * Buffers the stream (in memory, or in a temporary file once it grows beyond
+     * Parser.MAX_KEEP_IN_MEMORY_SIZE) because the archive handler needs seekable input.
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            InputStream source) throws ParserException, InterruptedException {
+        File spillFile = null;
+        try {
+            serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
+            serverFileUtils.copy(source, cfos);
+            // flush buffered data to the fallback file before the archive is read back from it
+            cfos.close();
+            if (cfos.isFallback()) {
+                spillFile = cfos.getContentFile();
+                return parse(location, mimeType, charset, spillFile);
+            } else {
+                // toByteArray() returns exactly the copied bytes, without unused buffer capacity
+                return parse(location, mimeType, charset, cfos.toByteArray());
+            }
+        } catch (IOException e) {
+            throw new ParserException("error processing 7zip archive", location, e);
+        } finally {
+            // the spill file is only a seekable copy of the stream; remove it once parsing is done
+            if (spillFile != null && !spillFile.delete()) spillFile.deleteOnExit();
+        }
+    }
+
+    public Hashtable getSupportedMimeTypes() {
+        return SUPPORTED_MIME_TYPES;
+    }
+}
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 49d5e37f0..51554ef44 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -941,7 +941,7 @@ public final class plasmaParser {
plasmaParser theParser = new plasmaParser();
// configuring the realtime parsable mimeTypes
- plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
+ plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");
// configure all other supported mimeTypes
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 66e05f18a..0e378b529 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -46,28 +46,35 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
+import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
+
+import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.serverFileUtils;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.net.URL;
+import de.anomic.plasma.parser.Parser;
public class plasmaParserDocument {
private URL location; // the source url
private String mimeType; // mimeType as taken from http header
private String charset; // the charset of the document
- private String[] keywords; // most resources provide a keyword field
- private String title; // a document title, taken from title or h1 tag; shall appear as headline of search result
- private String author; // author or copyright
- private String[] sections; // if present: more titles/headlines appearing in the document
- private String abstrct; // an abstract, if present: short content description
+ private List keywords; // most resources provide a keyword field
+ private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result
+ private StringBuffer author; // author or copyright
+ private List sections; // if present: more titles/headlines appearing in the document
+ private StringBuffer abstrct; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private Map anchors; // all links embedded as clickeable entities (anchor tags)
private TreeSet images; // all visible pictures in document
@@ -77,54 +84,63 @@ public class plasmaParserDocument {
private Map hyperlinks, audiolinks, videolinks, applinks;
private Map emaillinks;
private boolean resorted;
- private InputStream textStream;
-
- public plasmaParserDocument(URL location, String mimeType, String charset,
+ private InputStream textStream;
+
+ // Common constructor behind all public ones. Null arguments are replaced by empty defaults;
+ // a null text becomes an empty serverCachedFileOutputStream so that sub-documents can be appended.
+ protected plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
- byte[] text, Map anchors, TreeSet images) {
+ Object text, Map anchors, TreeSet images) {
this.location = location;
- this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
+ this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
- this.keywords = (keywords==null) ? new String[0] : keywords;
- this.title = (title==null)?"":title;
- this.author = (author==null)?"":author;
- this.sections = (sections==null)?new String[0]:sections;
- this.abstrct = (abstrct==null)?"":abstrct;
- this.text = (text==null)?new byte[0]:text;
- this.anchors = (anchors==null)?new HashMap(0):anchors;
- this.images = (images==null)?new TreeSet():images;
+ // Arrays.asList returns a fixed-size list; copy it so that addSubDocument can append to it
+ this.keywords = (keywords == null) ? new LinkedList() : new LinkedList(Arrays.asList(keywords));
+ this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
+ this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
+ // same as for the keywords: the section list must stay growable
+ this.sections = (sections == null) ? new LinkedList() : new LinkedList(Arrays.asList(sections));
+ this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
+ this.anchors = (anchors == null) ? new HashMap(0) : anchors;
+ this.images = (images == null) ? new TreeSet() : images;
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
this.applinks = null;
this.emaillinks = null;
this.resorted = false;
+
+ if (text == null) try {
+ this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
+ } catch (IOException e) {
+ e.printStackTrace();
+ this.text = new StringBuffer();
+ } else {
+ this.text = text;
+ }
+ }
+
+ // Creates an empty document: every optional argument is passed as null to the protected
+ // constructor, which substitutes its defaults for them.
+ public plasmaParserDocument(URL location, String mimeType, String charset) {
+ this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
+ }
+
+ // Creates a document whose text is given as a byte array; the (Object) cast routes the call
+ // to the protected constructor that does the actual initialisation.
+ public plasmaParserDocument(URL location, String mimeType, String charset,
+ String[] keywords, String title, String author,
+ String[] sections, String abstrct,
+ byte[] text, Map anchors, TreeSet images) {
+ this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
File text, Map anchors, TreeSet images) {
- this.location = location;
- this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
- this.charset = charset;
- this.keywords = (keywords==null) ? new String[0] : keywords;
- this.title = (title==null)?"":title;
- this.author = (author==null)?"":author;
- this.sections = (sections==null)?new String[0]:sections;
- this.abstrct = (abstrct==null)?"":abstrct;
- this.text = text;
- if (text != null) text.deleteOnExit();
- this.anchors = (anchors==null)?new HashMap(0):anchors;
- this.images = (images==null)?new TreeSet():images;
- this.hyperlinks = null;
- this.audiolinks = null;
- this.videolinks = null;
- this.applinks = null;
- this.emaillinks = null;
- this.resorted = false;
- }
+ this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+ }
+
+ // Creates a document whose text is held in a serverCachedFileOutputStream; delegates to the
+ // protected constructor via the (Object) cast.
+ public plasmaParserDocument(URL location, String mimeType, String charset,
+ String[] keywords, String title, String author,
+ String[] sections, String abstrct,
+ serverCachedFileOutputStream text, Map anchors, TreeSet images) {
+ this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+ }
public URL getLocation() {
return this.location;
@@ -142,19 +158,23 @@ public class plasmaParserDocument {
}
public String getTitle() {
- return title;
+ return title.toString();
}
public String[] getSectionTitles() {
- if (sections != null) return sections; else return new String[]{getTitle()};
+ if (sections != null) {
+ return (String[])sections.toArray(new String[this.sections.size()]);
+ } else {
+ return new String[] { getTitle() };
+ }
}
public String getAbstract() {
- if (abstrct != null) return abstrct; else return getTitle();
+ if (abstrct != null) return abstrct.toString(); else return getTitle();
}
public String getAuthor() {
- if (author != null) return author; else return "";
+ if (author != null) return author.toString(); else return new String();
}
public InputStream getText() {
@@ -165,6 +185,8 @@ public class plasmaParserDocument {
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[])this.text);
+ } else if (this.text instanceof serverCachedFileOutputStream) {
+ return ((serverCachedFileOutputStream)this.text).getContent();
}
return this.textStream;
} catch (Exception e) {
@@ -177,8 +199,18 @@ public class plasmaParserDocument {
try {
if (this.text == null) return new byte[0];
- if (this.text instanceof File) return serverFileUtils.read((File)this.text);
- else if (this.text instanceof byte[]) return (byte[])this.text;
+ if (this.text instanceof File) {
+ return serverFileUtils.read((File)this.text);
+ } else if (this.text instanceof byte[]) {
+ return (byte[])this.text;
+ } else if (this.text instanceof serverCachedFileOutputStream) {
+ serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text;
+ if (ffbaos.isFallback()) {
+ return serverFileUtils.read(ffbaos.getContent());
+ } else {
+ return ffbaos.getContentBAOS();
+ }
+ }
} catch (Exception e) {
e.printStackTrace();
}
@@ -189,6 +221,9 @@ public class plasmaParserDocument {
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
+ else if (this.text instanceof serverCachedFileOutputStream) {
+ return ((serverCachedFileOutputStream)this.text).getLength();
+ }
return -1;
}
@@ -204,19 +239,23 @@ public class plasmaParserDocument {
// sort out doubles and empty words
TreeSet hs = new TreeSet();
String s;
- for (int i = 0; i < this.keywords.length; i++) {
- if (this.keywords[i] == null) continue;
- s = this.keywords[i].trim();
+ for (int i = 0; i < this.keywords.size(); i++) {
+ if (this.keywords.get(i) == null) continue;
+ s = ((String)this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
if (hs.size() == 0) return "";
// generate a new list
- StringBuffer sb = new StringBuffer(this.keywords.length * 6);
+ StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
Iterator i = hs.iterator();
while (i.hasNext()) sb.append((String) i.next()).append(separator);
return sb.substring(0, sb.length() - 1);
}
+ // Returns the internal keyword list itself, not a copy; callers should treat it as read-only.
+ public List getKeywords() {
+ return this.keywords;
+ }
+
public Map getAnchors() {
// returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map
@@ -335,6 +374,27 @@ public class plasmaParserDocument {
this.resorted = true;
}
+    /**
+     * Merges a parsed sub-document (e.g. one file of an archive) into this document: section
+     * titles, title, keywords, abstract, text, anchors and images are appended.
+     *
+     * @param doc the sub-document to merge
+     * @throws IOException if copying the text of either document fails
+     */
+    public void addSubDocument(plasmaParserDocument doc) throws IOException {
+        // the current lists may be fixed-size (e.g. produced by Arrays.asList), on which
+        // addAll() throws UnsupportedOperationException; merge into fresh growable lists
+        List mergedSections = new LinkedList(this.sections);
+        mergedSections.addAll(Arrays.asList(doc.getSectionTitles()));
+        this.sections = mergedSections;
+
+        if (this.title.length() > 0) this.title.append('\n');
+        this.title.append(doc.getTitle());
+
+        List mergedKeywords = new LinkedList(this.keywords);
+        mergedKeywords.addAll(doc.getKeywords());
+        this.keywords = mergedKeywords;
+
+        if (this.abstrct.length() > 0) this.abstrct.append('\n');
+        this.abstrct.append(doc.getAbstract());
+
+        if (!(this.text instanceof serverCachedFileOutputStream)) {
+            // getText() reads from this.text, so the existing content has to be fetched
+            // BEFORE this.text is replaced; otherwise the new, empty stream would be copied
+            // onto itself and the previous text would be lost
+            InputStream ownText = getText();
+            serverCachedFileOutputStream merged = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
+            if (ownText != null) serverFileUtils.copy(ownText, merged);
+            this.text = merged;
+        }
+        InputStream subText = doc.getText();
+        if (subText != null) serverFileUtils.copy(subText, (serverCachedFileOutputStream)this.text);
+
+        anchors.putAll(doc.getAnchors());
+        images.addAll(doc.getImages());
+    }
+
public void close() {
// try close the output stream
if (this.textStream != null) {
diff --git a/source/de/anomic/server/serverCachedFileOutputStream.java b/source/de/anomic/server/serverCachedFileOutputStream.java
new file mode 100644
index 000000000..f42b9db35
--- /dev/null
+++ b/source/de/anomic/server/serverCachedFileOutputStream.java
@@ -0,0 +1,175 @@
+// FileFallbackByteArrayOutputStream.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+//
+// This file ist contributed by Franz Brausze
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.server;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * A ByteArrayOutputStream that keeps its content in memory until more than
+ * <code>fallbackSize</code> bytes have been written (or announced), and from then on
+ * writes everything to a file instead.
+ * <p>
+ * Once the stream has fallen back, all further writes go to the file, independent of the
+ * number of bytes written so far.
+ */
+public class serverCachedFileOutputStream extends ByteArrayOutputStream {
+
+    /** target file of the fallback; null until a temporary file is created on first fallback */
+    protected File fallbackFile;
+    /** number of bytes above which the content is moved to the file */
+    protected long fallbackSize;
+    /** whether file access is wrapped into buffered streams */
+    protected boolean buffered;
+
+    /** number of bytes written so far */
+    protected long size = 0;
+    protected boolean isFallback = false;
+    protected OutputStream fallback = null;
+
+    public serverCachedFileOutputStream(long fallbackSize) throws IOException {
+        this(fallbackSize, null, true, 32);
+    }
+
+    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered)
+            throws IOException {
+        this(fallbackSize, fallback, buffered, 32);
+    }
+
+    /**
+     * @param fallbackSize threshold in bytes above which the file is used
+     * @param fallback     file to fall back to, or null to use a temporary file
+     * @param buffered     whether to buffer file access
+     * @param size         expected content size; if it exceeds fallbackSize the stream
+     *                     falls back to the file immediately
+     * @throws IOException if the immediate fallback fails
+     */
+    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
+            long size) throws IOException {
+        this.fallbackSize = fallbackSize;
+        // a temporary file is created lazily in fallback(), so that streams which stay in
+        // memory do not leave an empty temp file behind
+        this.fallbackFile = fallback;
+        this.buffered = buffered;
+        if (size > this.fallbackSize) fallback();
+    }
+
+    /**
+     * Creates a stream that is pre-filled with the given data (the array is adopted, not copied).
+     */
+    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
+            byte[] data) throws IOException {
+        this(fallbackSize, fallback, buffered, 0);
+        super.buf = data;
+        super.count = data.length;
+        this.size = data.length;
+        if (this.size > this.fallbackSize) fallback();
+    }
+
+    /**
+     * Decides where the next write has to go and triggers the fallback if necessary.
+     *
+     * @param size total number of bytes after the pending write
+     * @return true if the write must go to the fallback file, false if it stays in memory
+     */
+    protected boolean checkFallback(long size) {
+        // once fallen back, always write to the file: the in-memory buffer is no longer read
+        if (this.isFallback) return true;
+        if (size <= this.fallbackSize) return false;
+        try {
+            fallback();
+            return true;
+        } catch (IOException e) {
+            throw new RuntimeException("error falling back to file", e);
+        }
+    }
+
+    /**
+     * Moves the bytes written so far to the fallback file and redirects all further writes to it.
+     * Calling it again after a successful fallback has no effect.
+     *
+     * @throws IOException if the file cannot be created or written
+     */
+    public void fallback() throws IOException {
+        if (this.isFallback) return;
+        if (this.fallbackFile == null) {
+            this.fallbackFile = File.createTempFile(
+                    serverCachedFileOutputStream.class.getName(),
+                    Long.toString(System.currentTimeMillis()));
+            // the temp file is owned by this class, make sure it does not survive the JVM
+            this.fallbackFile.deleteOnExit();
+        } else if (!this.fallbackFile.exists()) {
+            this.fallbackFile.createNewFile();
+        } else if (this.fallbackFile.isDirectory()) {
+            throw new IOException("cannot write on a directory");
+        }
+        OutputStream os = new FileOutputStream(this.fallbackFile);
+        if (this.buffered) os = new BufferedOutputStream(os);
+        try {
+            // only the first 'count' bytes of the buffer are content, the rest is spare capacity
+            os.write(super.buf, 0, super.count);
+        } catch (IOException e) {
+            try { os.close(); } catch (IOException ignored) { /* the write error is reported */ }
+            throw e;
+        }
+        // switch over only after the file really holds the content
+        this.fallback = os;
+        this.isFallback = true;
+        super.buf = new byte[0];
+        super.reset();
+    }
+
+    public boolean isFallback() {
+        return this.isFallback;
+    }
+
+    public void write(int b) {
+        if (checkFallback(++this.size)) {
+            try {
+                this.fallback.write(b);
+            } catch (IOException e) {
+                throw new RuntimeException("error writing to fallback", e);
+            }
+        } else {
+            super.write(b);
+        }
+    }
+
+    public void write(byte[] b, int off, int len) {
+        if (checkFallback(this.size += len)) {
+            try {
+                this.fallback.write(b, off, len);
+            } catch (IOException e) {
+                throw new RuntimeException("error writing to fallback", e);
+            }
+        } else {
+            super.write(b, off, len);
+        }
+    }
+
+    public void close() throws IOException {
+        if (this.fallback != null)
+            this.fallback.close();
+        super.close();
+    }
+
+    /**
+     * Closes this stream and returns a stream over exactly the bytes written to it.
+     */
+    public InputStream getContent() throws IOException {
+        close();
+        if (this.isFallback) {
+            InputStream is = new FileInputStream(this.fallbackFile);
+            return (this.buffered) ? new BufferedInputStream(is) : is;
+        } else {
+            return new ByteArrayInputStream(this.buf, 0, this.count);
+        }
+    }
+
+    /**
+     * @return a copy of exactly the bytes written so far (no spare buffer capacity)
+     * @throws RuntimeException if the content has already been moved to the file
+     */
+    public byte[] getContentBAOS() {
+        if (this.isFallback)
+            throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
+        return toByteArray();
+    }
+
+    /**
+     * @return the file holding the content
+     * @throws RuntimeException if the content is still held in memory
+     */
+    public File getContentFile() {
+        if (!this.isFallback)
+            throw new RuntimeException("haven't fallen back yet, fallback file has no content");
+        return this.fallbackFile;
+    }
+
+    public long getLength() {
+        return this.size;
+    }
+}
diff --git a/yacy.init b/yacy.init
index 8a06c8a4a..f292b1ef8 100644
--- a/yacy.init
+++ b/yacy.init
@@ -204,7 +204,7 @@ proxyCacheMigration = true
#
# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
# parseableMime: specifies mime-types that can be indexed but not on the fly
-parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain
+parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain,text/sgml
parseableMimeTypes=
parseableMimeTypes.CRAWLER=
parseableMimeTypes.PROXY=