diff --git a/.classpath b/.classpath index dc7ec4add..3c7633e61 100644 --- a/.classpath +++ b/.classpath @@ -1,42 +1,43 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.project b/.project index 3bbcaf65e..969b67fb5 100644 --- a/.project +++ b/.project @@ -1,17 +1,17 @@ -trunk - - - - - -org.eclipse.jdt.core.javabuilder - - - - - -org.eclipse.jdt.core.javanature - - + yacy + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + + diff --git a/httpd.mime b/httpd.mime index 5b6ed7114..b86eb728d 100644 --- a/httpd.mime +++ b/httpd.mime @@ -3,6 +3,7 @@ ### # Extension = MIME type +7z = application/x-7z-compressed ai = application/postscript aiff = audio/x-aiff au = audio/basic diff --git a/libx/J7Zip-modified.jar b/libx/J7Zip-modified.jar new file mode 100644 index 000000000..176b3210e Binary files /dev/null and b/libx/J7Zip-modified.jar differ diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index baa413a06..1d76b87c3 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -53,6 +53,7 @@ import java.io.IOException; import java.io.InputStream; import de.anomic.net.URL; +import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.server.serverThread; import de.anomic.server.logging.serverLog; @@ -138,6 +139,39 @@ public abstract class AbstractParser implements Parser{ return tempFile; } + public int parseDir(URL location, String prefix, File dir, plasmaParserDocument doc) + throws ParserException, InterruptedException, IOException { + if (!dir.isDirectory()) + throw new ParserException("tried to parse ordinary file " + dir + " as directory", location); + + String[] files = dir.list(); + int result = 0; + for (int i=0; i 
super.count) + throw new IndexOutOfBoundsException(Integer.toString(offset)); + super.pos = offset; + } + } + + private final SeekableByteArrayInputStream sbais; + + public ByteArrayIInStream(byte[] buffer) { + this.sbais = new SeekableByteArrayInputStream(buffer); + } + + public long Seek(long offset, int origin) { + switch (origin) { + case STREAM_SEEK_SET: this.sbais.seekAbsolute((int)offset); break; + case STREAM_SEEK_CUR: this.sbais.seekRelative((int)offset); break; + } + return this.sbais.getPosition(); + } + + public int read() throws IOException { + return this.sbais.read(); + } + + public int read(byte[] b, int off, int len) throws IOException { + return this.sbais.read(b, off, len); + } +} \ No newline at end of file diff --git a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java new file mode 100644 index 000000000..6e1fc8569 --- /dev/null +++ b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java @@ -0,0 +1,176 @@ +// SZParserExtractCallback.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004 +// +// This file ist contributed by Franz Brausze +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ +package de.anomic.plasma.parser.sevenzip; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import de.anomic.net.URL; +import de.anomic.plasma.plasmaParser; +import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.AbstractParser; +import de.anomic.plasma.parser.ParserException; +import de.anomic.server.serverCachedFileOutputStream; +import de.anomic.server.logging.serverLog; + +import SevenZip.ArchiveExtractCallback; +import SevenZip.Archive.IInArchive; +import SevenZip.Archive.SevenZipEntry; + +// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog +// and parse the extracted content +public class SZParserExtractCallback extends ArchiveExtractCallback { + + private final serverLog log; + private final long maxRamSize; + private serverCachedFileOutputStream cfos = null; + private final plasmaParser parser; + private final plasmaParserDocument doc; + private final String prefix; + + public SZParserExtractCallback(serverLog logger, IInArchive handler, + long maxRamSize, plasmaParserDocument doc, String prefix) { + super.Init(handler); + this.log = logger; + this.maxRamSize = maxRamSize; + this.parser = new plasmaParser(); + this.doc = doc; + this.prefix = prefix; + } + + public void PrepareOperation(int arg0) { + this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract); + switch (arg0) { + case IInArchive.NExtract_NAskMode_kExtract: + this.log.logFine("Extracting " + this.filePath); + break; + case IInArchive.NExtract_NAskMode_kTest: + this.log.logFine("Testing " + this.filePath); + break; + case IInArchive.NExtract_NAskMode_kSkip: + this.log.logFine("Skipping " + this.filePath); + break; + }; + } + + public void SetOperationResult(int arg0) throws IOException { + if (arg0 != IInArchive.NExtract_NOperationResult_kOK) { + this.NumErrors++; + switch(arg0) { + case 
IInArchive.NExtract_NOperationResult_kUnSupportedMethod: + throw new IOException("Unsupported Method"); + case IInArchive.NExtract_NOperationResult_kCRCError: + throw new IOException("CRC Failed"); + case IInArchive.NExtract_NOperationResult_kDataError: + throw new IOException("Data Error"); + default: + // throw new IOException("Unknown Error"); + } + } else try { + AbstractParser.checkInterruption(); + + if (this.cfos != null) { + // parse the file + plasmaParserDocument theDoc; + // workaround for relative links in file, normally '#' shall be used behind the location, see + // below for reversion of the effects + URL url = new URL(doc.getLocation(), this.prefix + "/" + super.filePath); + String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); + if (this.cfos.isFallback()) { + theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentFile()); + } else { + theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentBAOS()); + } + + // revert the above workaround + Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f); + Iterator it = theDoc.getAnchors().entrySet().iterator(); + Map.Entry entry; + String base = doc.getLocation().toNormalform(); + while (it.hasNext()) { + entry = (Map.Entry)it.next(); + if (((String)entry.getKey()).startsWith(base + "/")) { + String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1); + this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref); + nanchors.put(base + ref, entry.getValue()); + } else { + nanchors.put(entry.getKey(), entry.getValue()); + } + } + theDoc.getAnchors().clear(); + theDoc.getAnchors().putAll(nanchors); + this.doc.addSubDocument(theDoc); + } + } catch (ParserException e) { + IOException ex = new IOException("error parsing extracted content of " + super.filePath + ": " + e.getMessage()); + ex.initCause(e); + throw ex; + } catch (InterruptedException e) { + IOException ex = new 
IOException("interrupted"); + ex.initCause(e); + throw ex; + } + } + + public OutputStream GetStream(int index, int askExtractMode) throws IOException { + SevenZipEntry item = super.archiveHandler.getEntry(index); + super.filePath = item.getName(); + try { + AbstractParser.checkInterruption(); + } catch (InterruptedException e) { + IOException ex = new IOException("interrupted"); + ex.initCause(e); + throw ex; + } + this.cfos = (item.isDirectory()) ? null + : new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize()); + return this.cfos; + } + + public String getCurrentFilePath() { + return super.filePath; + } +} diff --git a/source/de/anomic/plasma/parser/sevenzip/build.xml b/source/de/anomic/plasma/parser/sevenzip/build.xml new file mode 100644 index 000000000..6a7232ab7 --- /dev/null +++ b/source/de/anomic/plasma/parser/sevenzip/build.xml @@ -0,0 +1,58 @@ + + + + Some classes to parse 7zip files + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java new file mode 100644 index 000000000..ec9e0d40a --- /dev/null +++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java @@ -0,0 +1,146 @@ +// sevenzipParser.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004 +// +// This file ist contributed by Franz Brausze +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ +package de.anomic.plasma.parser.sevenzip; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Hashtable; + +import SevenZip.IInStream; +import SevenZip.MyRandomAccessFile; +import SevenZip.Archive.SevenZip.Handler; + +import de.anomic.net.URL; +import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.AbstractParser; +import de.anomic.plasma.parser.Parser; +import de.anomic.plasma.parser.ParserException; +import de.anomic.server.serverCachedFileOutputStream; +import de.anomic.server.serverFileUtils; + +public class sevenzipParser extends AbstractParser implements Parser { + + /** + * a list of mime types that are supported by this parser class + * @see #getSupportedMimeTypes() + */ + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + static { + SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); + } + + /** + * a list of library names that are needed by this parser + * @see Parser#getLibxDependences() + */ + private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" }; + + public sevenzipParser() throws IOException { + super(LIBX_DEPENDENCIES); + super.parserName = "7zip Archive Parser"; + } + + public plasmaParserDocument parse(URL location, String mimeType, String charset, + IInStream source, long maxRamSize) throws ParserException, InterruptedException { + plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset); + Handler archive; + super.theLogger.logFine("opening 7zip archive..."); + try { + archive = new Handler(source); + } catch (IOException e) { + throw new ParserException("error opening 7zip archive", location, e); + } + checkInterruption(); + SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive, + maxRamSize, doc, location.getFile()); + super.theLogger.logFine("processing archive contents..."); + try { + archive.Extract(null, -1, 0, aec); + return doc; + } catch 
(IOException e) { + if (e.getCause() instanceof InterruptedException) + throw (InterruptedException)e.getCause(); + if (e.getCause() instanceof ParserException) + throw (ParserException)e.getCause(); + throw new ParserException( + "error processing 7zip archive at internal file: " + aec.getCurrentFilePath(), + location, e); + } finally { + try { archive.close(); } catch (IOException e) { } + } + } + + public plasmaParserDocument parse(URL location, String mimeType, String charset, + byte[] source) throws ParserException, InterruptedException { + return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length); + } + + public plasmaParserDocument parse(URL location, String mimeType, String charset, + File sourceFile) throws ParserException, InterruptedException { + try { + return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE); + } catch (IOException e) { + throw new ParserException("error processing 7zip archive", location, e); + } + } + + public plasmaParserDocument parse(URL location, String mimeType, String charset, + InputStream source) throws ParserException, InterruptedException { + try { + serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); + serverFileUtils.copy(source, cfos); + if (cfos.isFallback()) { + return parse(location, mimeType, charset, cfos.getContentFile()); + } else { + return parse(location, mimeType, charset, cfos.getContentBAOS()); + } + } catch (IOException e) { + throw new ParserException("error processing 7zip archive", location, e); + } + } + + public Hashtable getSupportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } +} diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 49d5e37f0..51554ef44 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -941,7 +941,7 @@ 
public final class plasmaParser { plasmaParser theParser = new plasmaParser(); // configuring the realtime parsable mimeTypes - plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain"); + plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml"); // configure all other supported mimeTypes plasmaParser.enableAllParsers(PARSER_MODE_PROXY); diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 66e05f18a..0e378b529 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -46,28 +46,35 @@ import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; + +import de.anomic.server.serverCachedFileOutputStream; import de.anomic.server.serverFileUtils; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; import java.util.Map; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.net.URL; +import de.anomic.plasma.parser.Parser; public class plasmaParserDocument { private URL location; // the source url private String mimeType; // mimeType as taken from http header private String charset; // the charset of the document - private String[] keywords; // most resources provide a keyword field - private String title; // a document title, taken from title or h1 tag; shall appear as headline of search result - private String author; // author or copyright - private String[] sections; // if present: more titles/headlines appearing in the document - private String abstrct; // an abstract, if present: short content description + private List keywords; // most resources provide a keyword field + private 
StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result + private StringBuffer author; // author or copyright + private List sections; // if present: more titles/headlines appearing in the document + private StringBuffer abstrct; // an abstract, if present: short content description private Object text; // the clear text, all that is visible private Map anchors; // all links embedded as clickeable entities (anchor tags) private TreeSet images; // all visible pictures in document @@ -77,54 +84,63 @@ public class plasmaParserDocument { private Map hyperlinks, audiolinks, videolinks, applinks; private Map emaillinks; private boolean resorted; - private InputStream textStream; - - public plasmaParserDocument(URL location, String mimeType, String charset, + private InputStream textStream; + + protected plasmaParserDocument(URL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - byte[] text, Map anchors, TreeSet images) { + Object text, Map anchors, TreeSet images) { this.location = location; - this.mimeType = (mimeType==null)?"application/octet-stream":mimeType; + this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; - this.keywords = (keywords==null) ? new String[0] : keywords; - this.title = (title==null)?"":title; - this.author = (author==null)?"":author; - this.sections = (sections==null)?new String[0]:sections; - this.abstrct = (abstrct==null)?"":abstrct; - this.text = (text==null)?new byte[0]:text; - this.anchors = (anchors==null)?new HashMap(0):anchors; - this.images = (images==null)?new TreeSet():images; + this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords); + this.title = (title == null) ? new StringBuffer() : new StringBuffer(title); + this.author = (author == null) ? new StringBuffer() : new StringBuffer(author); + this.sections = (sections == null) ? 
new LinkedList() : Arrays.asList(sections); + this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct); + this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.images = (images == null) ? new TreeSet() : images; this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; this.applinks = null; this.emaillinks = null; this.resorted = false; + + if (text == null) try { + this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); + } catch (IOException e) { + e.printStackTrace(); + this.text = new StringBuffer(); + } else { + this.text = text; + } + } + + public plasmaParserDocument(URL location, String mimeType, String charset) { + this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null); + } + + public plasmaParserDocument(URL location, String mimeType, String charset, + String[] keywords, String title, String author, + String[] sections, String abstrct, + byte[] text, Map anchors, TreeSet images) { + this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } public plasmaParserDocument(URL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, File text, Map anchors, TreeSet images) { - this.location = location; - this.mimeType = (mimeType==null)?"application/octet-stream":mimeType; - this.charset = charset; - this.keywords = (keywords==null) ? 
new String[0] : keywords; - this.title = (title==null)?"":title; - this.author = (author==null)?"":author; - this.sections = (sections==null)?new String[0]:sections; - this.abstrct = (abstrct==null)?"":abstrct; - this.text = text; - if (text != null) text.deleteOnExit(); - this.anchors = (anchors==null)?new HashMap(0):anchors; - this.images = (images==null)?new TreeSet():images; - this.hyperlinks = null; - this.audiolinks = null; - this.videolinks = null; - this.applinks = null; - this.emaillinks = null; - this.resorted = false; - } + this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); + } + + public plasmaParserDocument(URL location, String mimeType, String charset, + String[] keywords, String title, String author, + String[] sections, String abstrct, + serverCachedFileOutputStream text, Map anchors, TreeSet images) { + this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); + } public URL getLocation() { return this.location; @@ -142,19 +158,23 @@ public class plasmaParserDocument { } public String getTitle() { - return title; + return title.toString(); } public String[] getSectionTitles() { - if (sections != null) return sections; else return new String[]{getTitle()}; + if (sections != null) { + return (String[])sections.toArray(new String[this.sections.size()]); + } else { + return new String[] { getTitle() }; + } } public String getAbstract() { - if (abstrct != null) return abstrct; else return getTitle(); + if (abstrct != null) return abstrct.toString(); else return getTitle(); } public String getAuthor() { - if (author != null) return author; else return ""; + if (author != null) return author.toString(); else return new String(); } public InputStream getText() { @@ -165,6 +185,8 @@ public class plasmaParserDocument { this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); } else if (this.text instanceof byte[]) { 
this.textStream = new ByteArrayInputStream((byte[])this.text); + } else if (this.text instanceof serverCachedFileOutputStream) { + return ((serverCachedFileOutputStream)this.text).getContent(); } return this.textStream; } catch (Exception e) { @@ -177,8 +199,18 @@ public class plasmaParserDocument { try { if (this.text == null) return new byte[0]; - if (this.text instanceof File) return serverFileUtils.read((File)this.text); - else if (this.text instanceof byte[]) return (byte[])this.text; + if (this.text instanceof File) { + return serverFileUtils.read((File)this.text); + } else if (this.text instanceof byte[]) { + return (byte[])this.text; + } else if (this.text instanceof serverCachedFileOutputStream) { + serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text; + if (ffbaos.isFallback()) { + return serverFileUtils.read(ffbaos.getContent()); + } else { + return ffbaos.getContentBAOS(); + } + } } catch (Exception e) { e.printStackTrace(); } @@ -189,6 +221,9 @@ public class plasmaParserDocument { if (this.text == null) return 0; if (this.text instanceof File) return ((File)this.text).length(); else if (this.text instanceof byte[]) return ((byte[])this.text).length; + else if (this.text instanceof serverCachedFileOutputStream) { + return ((serverCachedFileOutputStream)this.text).getLength(); + } return -1; } @@ -204,19 +239,23 @@ public class plasmaParserDocument { // sort out doubles and empty words TreeSet hs = new TreeSet(); String s; - for (int i = 0; i < this.keywords.length; i++) { - if (this.keywords[i] == null) continue; - s = this.keywords[i].trim(); + for (int i = 0; i < this.keywords.size(); i++) { + if (this.keywords.get(i) == null) continue; + s = ((String)this.keywords.get(i)).trim(); if (s.length() > 0) hs.add(s.toLowerCase()); } if (hs.size() == 0) return ""; // generate a new list - StringBuffer sb = new StringBuffer(this.keywords.length * 6); + StringBuffer sb = new StringBuffer(this.keywords.size() * 6); Iterator i = 
hs.iterator(); while (i.hasNext()) sb.append((String) i.next()).append(separator); return sb.substring(0, sb.length() - 1); } + public List getKeywords() { + return this.keywords; + } + public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map @@ -335,6 +374,27 @@ public class plasmaParserDocument { this.resorted = true; } + public void addSubDocument(plasmaParserDocument doc) throws IOException { + this.sections.addAll(Arrays.asList(doc.getSectionTitles())); + + if (this.title.length() > 0) this.title.append('\n'); + this.title.append(doc.getTitle()); + + this.keywords.addAll(doc.getKeywords()); + + if (this.abstrct.length() > 0) this.abstrct.append('\n'); + this.abstrct.append(doc.getAbstract()); + + if (!(this.text instanceof serverCachedFileOutputStream)) { + this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); + serverFileUtils.copy(getText(), (serverCachedFileOutputStream)this.text); + } + serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text); + + anchors.putAll(doc.getAnchors()); + images.addAll(doc.getImages()); + } + public void close() { // try close the output stream if (this.textStream != null) { diff --git a/source/de/anomic/server/serverCachedFileOutputStream.java b/source/de/anomic/server/serverCachedFileOutputStream.java new file mode 100644 index 000000000..f42b9db35 --- /dev/null +++ b/source/de/anomic/server/serverCachedFileOutputStream.java @@ -0,0 +1,175 @@ +// FileFallbackByteArrayOutputStream.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004 +// +// This file ist contributed by Franz Brausze +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of 
the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/**
 * A {@link ByteArrayOutputStream} that keeps its content in memory only up to
 * a given size. As soon as more than <code>fallbackSize</code> bytes are
 * written (or announced by the size hint of the constructor), the content is
 * moved to a file and all further bytes are written to that file.
 *
 * Only {@link #write(int)}, {@link #write(byte[], int, int)},
 * {@link #close()} and the <code>getContent*</code> methods are aware of the
 * fallback; the other inherited methods always operate on the memory buffer.
 */
public class serverCachedFileOutputStream extends ByteArrayOutputStream {

    /** file used after the fallback; may be null until the fallback happens */
    protected File fallbackFile;
    /** number of bytes that may be kept in memory */
    protected long fallbackSize;
    /** whether the file streams shall be buffered */
    protected boolean buffered;

    /** number of bytes written so far, no matter where to */
    protected long size = 0;
    protected boolean isFallback = false;
    /** stream to the fallback file, null until the fallback happens */
    protected OutputStream fallback = null;

    public serverCachedFileOutputStream(long fallbackSize) throws IOException {
        this(fallbackSize, null, true, 32);
    }

    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered)
            throws IOException {
        this(fallbackSize, fallback, buffered, 32);
    }

    /**
     * @param fallbackSize number of bytes that may be kept in memory
     * @param fallback     file to use when the limit is exceeded, if
     *                     <code>null</code> a temporary file is created when
     *                     it is needed
     * @param buffered     whether the streams from and to the file shall be
     *                     buffered
     * @param size         expected size of the content, if it exceeds
     *                     <code>fallbackSize</code> the file is used from the
     *                     beginning
     * @throws IOException declared for compatibility with existing callers
     */
    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
            long size) throws IOException {
        this.fallbackSize = fallbackSize;
        // a missing file is created lazily in fallback(), otherwise every
        // stream would leave a temporary file behind, even if all of its
        // content fits into memory
        this.fallbackFile = fallback;
        this.buffered = buffered;
        checkFallback(size);
    }

    /**
     * Creates a stream which already contains the given data. The array is
     * used as internal buffer as long as the content is kept in memory.
     */
    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
            byte[] data) throws IOException {
        this(fallbackSize, fallback, buffered, 0);
        this.size = data.length;
        if (this.isFallback) {
            // happens for a negative fallbackSize: the file is already in use
            this.fallback.write(data);
        } else {
            super.buf = data;
            super.count = data.length;
            checkFallback(this.size);
        }
    }

    /**
     * Switches to the fallback file if the given size exceeds the limit.
     *
     * @return whether the content is (now) stored in the file; once fallen
     *         back this is always true, independent of the size passed
     */
    protected boolean checkFallback(long size) {
        if (this.isFallback) return true;
        if (size <= this.fallbackSize) return false;
        try {
            fallback();
        } catch (IOException e) {
            throw new RuntimeException("error falling back to file", e);
        }
        return true;
    }

    /**
     * Moves the content collected so far to the fallback file and redirects
     * all further writes to it. Does nothing if this already happened.
     */
    public void fallback() throws IOException {
        if (this.isFallback) return;
        if (this.fallbackFile == null) {
            this.fallbackFile = File.createTempFile(
                    serverCachedFileOutputStream.class.getName(),
                    Long.toString(System.currentTimeMillis()));
            // the file was created by this class, nobody else will remove it
            this.fallbackFile.deleteOnExit();
        } else if (this.fallbackFile.isDirectory()) {
            throw new IOException("cannot write on a directory");
        }

        OutputStream os = new FileOutputStream(this.fallbackFile);
        if (this.buffered) os = new BufferedOutputStream(os);
        try {
            // writes only the 'count' valid bytes, not the whole buffer
            super.writeTo(os);
        } catch (IOException e) {
            try { os.close(); } catch (IOException ignored) { /* the first error is reported */ }
            throw e;
        }

        // switch the state only after the file is known to be usable
        this.fallback = os;
        this.isFallback = true;
        super.buf = new byte[0];
        super.count = 0;
    }

    public boolean isFallback() {
        return this.isFallback;
    }

    public void write(int b) {
        if (checkFallback(++this.size)) try {
            this.fallback.write(b);
        } catch (IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        } else {
            super.write(b);
        }
    }

    public void write(byte[] b, int off, int len) {
        if (checkFallback(this.size += len)) try {
            this.fallback.write(b, off, len);
        } catch (IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        } else {
            super.write(b, off, len);
        }
    }

    public void close() throws IOException {
        if (this.fallback != null)
            this.fallback.close();
        super.close();
    }

    /**
     * Closes this stream and returns its content.
     *
     * @return a stream reading the fallback file or the memory buffer
     */
    public InputStream getContent() throws IOException {
        close();
        if (this.isFallback) {
            InputStream is = new FileInputStream(this.fallbackFile);
            return (this.buffered) ? new BufferedInputStream(is) : is;
        }
        // the buffer is usually larger than its content
        return new ByteArrayInputStream(this.buf, 0, this.count);
    }

    /**
     * @return a copy of the bytes written, if they are still kept in memory
     * @throws RuntimeException if the content has been moved to the file
     */
    public byte[] getContentBAOS() {
        if (this.isFallback)
            throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
        // not super.buf: the buffer is usually larger than its content
        return super.toByteArray();
    }

    /**
     * @return the file containing all bytes written so far
     * @throws RuntimeException if the content is still kept in memory or if
     *         pending bytes cannot be written to the file
     */
    public File getContentFile() {
        if (!this.isFallback)
            throw new RuntimeException("haven't fallen back yet, fallback file has no content");
        try {
            // bytes pending in a buffered stream must reach the file before
            // the caller reads it
            this.fallback.flush();
        } catch (IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        }
        return this.fallbackFile;
    }

    /**
     * @return the number of bytes written to this stream
     */
    public long getLength() {
        return this.size;
    }
}