diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 0a772f27b..07b22f6ab 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -59,7 +59,8 @@ import net.yacy.kelondro.util.MemoryControl;
public class ContentScraper extends AbstractScraper implements Scraper {
private static final String EMPTY_STRING = new String();
-
+ public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
+
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "'".toCharArray();
@@ -166,7 +167,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.bold = new ClusteredScoreMap();
this.italic = new ClusteredScoreMap();
this.li = new ArrayList();
- this.content = new CharBuffer(1024);
+ this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0f;
this.lat = 0.0f;
diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java
index 1858613e3..c6d97bea4 100644
--- a/source/net/yacy/document/parser/html/ContentTransformer.java
+++ b/source/net/yacy/document/parser/html/ContentTransformer.java
@@ -37,7 +37,7 @@ import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
public class ContentTransformer extends AbstractTransformer implements Transformer {
-
+
// statics: for initialization of the HTMLFilterAbstractTransformer
private static final TreeSet linkTags0 = new TreeSet(ASCII.insensitiveASCIIComparator);
private static final TreeSet linkTags1 = new TreeSet(ASCII.insensitiveASCIIComparator);
@@ -82,7 +82,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
private static char[] genBlueLetters(int length) {
- final CharBuffer bb = new CharBuffer(" ".toCharArray());
+ final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, " ".toCharArray());
length = length / 2;
if (length > 10) length = 7;
while (length-- > 0) {
@@ -106,7 +106,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
return false;
}
-
+
@Override
public char[] transformText(final char[] text) {
if (this.bluelist != null) {
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index 6670aa27d..e6dfe9c75 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -96,7 +96,7 @@ public final class TransformerWriter extends Writer {
this.outStream = outStream;
this.scraper = scraper;
this.transformer = transformer;
- this.buffer = new CharBuffer(initialBufferSize);
+ this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
@@ -114,7 +114,7 @@ public final class TransformerWriter extends Writer {
}
public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) {
- final CharBuffer bb = new CharBuffer(tagname.length() + tagopts.length + 3);
+ final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + tagopts.length + 3);
bb.append('<');
if (!opening) {
bb.append('/');
@@ -136,7 +136,7 @@ public final class TransformerWriter extends Writer {
}
public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) {
- final CharBuffer bb = new CharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
+ final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, 2 * tagname.length() + tagopts.length + text.length + 5);
bb.append('<').append(tagname);
if (tagopts.length > 0) {
// if (tagopts[0] == (byte) 32)
@@ -157,7 +157,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar);
- final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
+ final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
bb.append('<').append(tagname);
if (tagoptsx != null) {
bb.appendSpace();
@@ -175,7 +175,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
final char[] gt0 = genTag0(tagname, tagopts, quotechar);
- final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
+ final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append('<').append('/').append(tagname).append('>');
final char[] result = cb.getChars();
try {
@@ -189,7 +189,7 @@ public final class TransformerWriter extends Writer {
// a helper method for pretty-printing of properties for html tags
public static char[] genOpts(final Properties prop, final char quotechar) {
final Enumeration> e = prop.propertyNames();
- final CharBuffer bb = new CharBuffer(prop.size() * 40);
+ final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, prop.size() * 40);
String key;
while (e.hasMoreElements()) {
key = (String) e.nextElement();
@@ -225,7 +225,7 @@ public final class TransformerWriter extends Writer {
if (opening) {
if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
// this single tag is collected at once here
- final CharBuffer charBuffer = new CharBuffer(content);
+ final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
@@ -236,7 +236,7 @@ public final class TransformerWriter extends Writer {
}
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
- final CharBuffer scb = new CharBuffer(content);
+ final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
@@ -250,14 +250,14 @@ public final class TransformerWriter extends Writer {
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
// ok, start collecting
this.filterTag = tag;
- final CharBuffer scb = new CharBuffer(content);
+ final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
- if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset();
+ if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
// we ignore that thing and return it again
diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java
index b0b45d0e2..b86fb7b30 100644
--- a/source/net/yacy/document/parser/odtParser.java
+++ b/source/net/yacy/document/parser/odtParser.java
@@ -1,4 +1,4 @@
-//odtParser.java
+//odtParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
@@ -50,71 +50,73 @@ import net.yacy.kelondro.util.FileUtils;
public class odtParser extends AbstractParser implements Parser {
- public odtParser() {
+ public final static int MAX_DOCSIZE = 200 * 1024 * 1024;
+
+ public odtParser() {
super("OASIS OpenDocument V2 Text Document Parser");
- SUPPORTED_EXTENSIONS.add("odt");
- SUPPORTED_EXTENSIONS.add("ods");
- SUPPORTED_EXTENSIONS.add("odp");
- SUPPORTED_EXTENSIONS.add("odg");
- SUPPORTED_EXTENSIONS.add("odc");
- SUPPORTED_EXTENSIONS.add("odf");
- SUPPORTED_EXTENSIONS.add("odb");
- SUPPORTED_EXTENSIONS.add("odi");
- SUPPORTED_EXTENSIONS.add("odm");
- SUPPORTED_EXTENSIONS.add("ott");
- SUPPORTED_EXTENSIONS.add("ots");
- SUPPORTED_EXTENSIONS.add("otp");
- SUPPORTED_EXTENSIONS.add("otg");
- SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format
- SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.chart");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.formula");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.database");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.image");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-master");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-template");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet-template");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation-template");
- SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics-template");
- SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
- SUPPORTED_MIME_TYPES.add("application/OOo-calc");
- SUPPORTED_MIME_TYPES.add("application/OOo-writer");
+ this.SUPPORTED_EXTENSIONS.add("odt");
+ this.SUPPORTED_EXTENSIONS.add("ods");
+ this.SUPPORTED_EXTENSIONS.add("odp");
+ this.SUPPORTED_EXTENSIONS.add("odg");
+ this.SUPPORTED_EXTENSIONS.add("odc");
+ this.SUPPORTED_EXTENSIONS.add("odf");
+ this.SUPPORTED_EXTENSIONS.add("odb");
+ this.SUPPORTED_EXTENSIONS.add("odi");
+ this.SUPPORTED_EXTENSIONS.add("odm");
+ this.SUPPORTED_EXTENSIONS.add("ott");
+ this.SUPPORTED_EXTENSIONS.add("ots");
+ this.SUPPORTED_EXTENSIONS.add("otp");
+ this.SUPPORTED_EXTENSIONS.add("otg");
+ this.SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format
+ this.SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.chart");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.formula");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.database");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.image");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-master");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-template");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet-template");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation-template");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics-template");
+ this.SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
+ this.SUPPORTED_MIME_TYPES.add("application/OOo-calc");
+ this.SUPPORTED_MIME_TYPES.add("application/OOo-writer");
}
-
+
private Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final File dest)
throws Parser.Failure, InterruptedException {
-
+
CharBuffer writer = null;
- try {
+ try {
String docDescription = null;
String docKeywordStr = null;
String docShortTitle = null;
String docLongTitle = null;
String docAuthor = null;
String docLanguage = null;
-
+
// opening the file as zip file
final ZipFile zipFile = new ZipFile(dest);
final Enumeration extends ZipEntry> zipEnum = zipFile.entries();
final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
-
+
// looping through all containing files
while (zipEnum.hasMoreElements()) {
-
+
// getting the next zip file entry
final ZipEntry zipEntry= zipEnum.nextElement();
final String entryName = zipEntry.getName();
-
+
// content.xml contains the document content in xml format
if (entryName.equals("content.xml")) {
-
+
// create a writer for output
- writer = new CharBuffer((int)zipEntry.getSize());
+ writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@@ -142,22 +144,22 @@ public class odtParser extends AbstractParser implements Parser {
docLanguage = metaData.getLanguage();
}
}
-
+
// make the languages set
final Set languages = new HashSet(1);
if (docLanguage != null) languages.add(docLanguage);
-
+
// if there is no title availabe we generate one
if (docLongTitle == null) {
if (docShortTitle != null) {
docLongTitle = docShortTitle;
- }
- }
-
+ }
+ }
+
// split the keywords
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
-
+
// create the parser document
Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString());
@@ -173,41 +175,42 @@ public class odtParser extends AbstractParser implements Parser {
"",
null,
docDescription,
- 0.0f, 0.0f,
+ 0.0f, 0.0f,
contentBytes,
null,
null,
null,
false)};
return docs;
- } catch (final Exception e) {
+ } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
+
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */}
-
- throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
+
+ throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
}
}
-
+
+ @Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile
dest = File.createTempFile("OpenDocument", ".odt");
dest.deleteOnExit();
-
+
// copying the stream into a file
FileUtils.copy(source, dest);
-
+
// parsing the content
return parse(location, mimeType, charset, dest);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
- throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
+
+ throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
} finally {
if (dest != null) FileUtils.deletedelete(dest);
}
diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java
index 195b3f7f7..fdc9192ff 100644
--- a/source/net/yacy/document/parser/ooxmlParser.java
+++ b/source/net/yacy/document/parser/ooxmlParser.java
@@ -1,4 +1,4 @@
-//odtParser.java
+//ooxmlParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
@@ -53,61 +53,61 @@ import net.yacy.kelondro.util.FileUtils;
public class ooxmlParser extends AbstractParser implements Parser {
- public ooxmlParser() {
- super("Open Office XML Document Parser");
- SUPPORTED_EXTENSIONS.add("docx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- SUPPORTED_EXTENSIONS.add("dotx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template");
- SUPPORTED_EXTENSIONS.add("potx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template");
- SUPPORTED_EXTENSIONS.add("ppsx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow");
- SUPPORTED_EXTENSIONS.add("pptx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
- SUPPORTED_EXTENSIONS.add("xlsx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- SUPPORTED_EXTENSIONS.add("xltx");
- SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template");
+ public ooxmlParser() {
+ super("Open Office XML Document Parser");
+ this.SUPPORTED_EXTENSIONS.add("docx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ this.SUPPORTED_EXTENSIONS.add("dotx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template");
+ this.SUPPORTED_EXTENSIONS.add("potx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template");
+ this.SUPPORTED_EXTENSIONS.add("ppsx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow");
+ this.SUPPORTED_EXTENSIONS.add("pptx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ this.SUPPORTED_EXTENSIONS.add("xlsx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ this.SUPPORTED_EXTENSIONS.add("xltx");
+ this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template");
}
-
+
private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException {
-
+
CharBuffer writer = null;
- try {
+ try {
String docDescription = null;
String docKeywordStr = null;
String docShortTitle = null;
String docLongTitle = null;
String docAuthor = null;
String docLanguage = null;
-
+
// opening the file as zip file
final ZipFile zipFile= new ZipFile(dest);
final Enumeration extends ZipEntry> zipEnum = zipFile.entries();
final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
-
+
// looping through all containing files
while (zipEnum.hasMoreElements()) {
-
+
// get next zip file entry
final ZipEntry zipEntry= zipEnum.nextElement();
final String entryName = zipEntry.getName();
-
+
// content.xml contains the document content in xml format
if (entryName.equals("word/document.xml")
|| entryName.startsWith("ppt/slides/slide")
|| entryName.startsWith("xl/worksheets/sheet")) {
-
+
// create a writer for output
- writer = new CharBuffer((int)zipEntry.getSize());
+ writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
-
+
// close readers and writers
} finally {
zipFileEntryStream.close();
@@ -129,21 +129,21 @@ public class ooxmlParser extends AbstractParser implements Parser {
docLanguage = metaData.getLanguage();
}
}
-
+
// make the languages set
final Set languages = new HashSet(1);
if (docLanguage != null && docLanguage.length() == 0)
languages.add(docLanguage);
-
+
// if there is no title availabe we generate one
if ((docLongTitle == null || docLongTitle.length() == 0) && (docShortTitle != null)) {
docLongTitle = docShortTitle;
- }
-
+ }
+
// split the keywords
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
-
+
// create the parser document
Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString());
@@ -159,44 +159,45 @@ public class ooxmlParser extends AbstractParser implements Parser {
"",
null,
docDescription,
- 0.0f, 0.0f,
+ 0.0f, 0.0f,
contentBytes,
null,
null,
null,
false)};
return docs;
- } catch (final Exception e) {
+ } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
+
// close the writer
if (writer != null) try {
writer.close();
} catch (final Exception ex) {/* ignore this */}
Log.logException(e);
- throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
+ throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
}
}
-
+
+ @Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile
dest = File.createTempFile("OpenDocument", ".odt");
dest.deleteOnExit();
-
+
// copying the stream into a file
FileUtils.copy(source, dest);
-
+
// parsing the content
return parse(location, mimeType, charset, dest);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
-
- throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
+
+ throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);
} finally {
if (dest != null) FileUtils.deletedelete(dest);
}
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 2d5dd75c7..11db922fc 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -67,6 +67,7 @@ public class pdfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
}
+ @Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
@@ -125,12 +126,13 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null || docTitle.length() == 0) {
docTitle = MultiProtocolURI.unescape(location.getFileName());
}
- final CharBuffer writer = new CharBuffer();
+ final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
try {
// create a writer for output
final PDFTextStripper stripper = new PDFTextStripper();
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final Thread t = new Thread() {
+ @Override
public void run() {
try {
stripper.writeText(pdfDoc, writer); // may throw a NPE
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
index 1022a7c51..8819aa07e 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
@@ -188,7 +188,7 @@ public class URIMetadataRow implements URIMetadata {
final String dc_publisher,
final float lat,
final float lon) {
- final CharBuffer s = new CharBuffer(360);
+ final CharBuffer s = new CharBuffer(20000, 360);
s.append(url.toNormalform(false, true)).appendLF();
s.append(dc_title).appendLF();
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);
@@ -381,23 +381,23 @@ public class URIMetadataRow implements URIMetadata {
public boolean matches(final Pattern matcher) {
return this.metadata().matches(matcher);
}
-
+
public DigestURI url() {
return this.metadata().url();
}
-
+
public String dc_title() {
return this.metadata().dc_title();
}
-
+
public String dc_creator() {
return this.metadata().dc_creator();
}
-
+
public String dc_publisher() {
return this.metadata().dc_publisher();
}
-
+
public String dc_subject() {
return this.metadata().dc_subject();
}
@@ -409,7 +409,7 @@ public class URIMetadataRow implements URIMetadata {
public float lon() {
return this.metadata().lon();
}
-
+
private Components metadata() {
// avoid double computation of metadata elements
if (this.comp != null) return this.comp;
diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java
index 70fe1b3f3..85d5767ee 100644
--- a/source/net/yacy/kelondro/io/CharBuffer.java
+++ b/source/net/yacy/kelondro/io/CharBuffer.java
@@ -1,4 +1,4 @@
-// serverCharBuffer.java
+// CharBuffer.java
// ---------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
@@ -34,42 +34,46 @@ import java.util.Properties;
import net.yacy.cora.document.UTF8;
public final class CharBuffer extends Writer {
-
+
public static final char singlequote = '\'';
public static final char doublequote = '"';
public static final char equal = '=';
-
+
private char[] buffer;
private int offset;
private int length;
+ private final int maximumLength;
-
- public CharBuffer() {
+ public CharBuffer(final int maximumLength) {
this.buffer = new char[10];
this.length = 0;
this.offset = 0;
+ this.maximumLength = maximumLength;
}
-
- public CharBuffer(final int initLength) {
+
+ public CharBuffer(final int maximumLength, final int initLength) {
this.buffer = new char[initLength];
this.length = 0;
this.offset = 0;
- }
-
- public CharBuffer(final char[] bb) {
+ this.maximumLength = maximumLength;
+ }
+
+ public CharBuffer(final int maximumLength, final char[] bb) {
this.buffer = bb;
this.length = bb.length;
this.offset = 0;
+ this.maximumLength = maximumLength;
}
- public CharBuffer(final char[] bb, final int initLength) {
+ public CharBuffer(final int maximumLength, final char[] bb, final int initLength) {
this.buffer = new char[initLength];
System.arraycopy(bb, 0, this.buffer, 0, bb.length);
this.length = bb.length;
this.offset = 0;
+ this.maximumLength = maximumLength;
}
-
- public CharBuffer(final char[] bb, final int of, final int le) {
+
+ public CharBuffer(final int maximumLength, final char[] bb, final int of, final int le) {
if (of * 2 > bb.length) {
this.buffer = new char[le];
System.arraycopy(bb, of, this.buffer, 0, le);
@@ -80,17 +84,20 @@ public final class CharBuffer extends Writer {
this.length = le;
this.offset = of;
}
+ this.maximumLength = maximumLength;
}
public CharBuffer(final CharBuffer bb) {
this.buffer = bb.buffer;
this.length = bb.length;
this.offset = bb.offset;
+ this.maximumLength = bb.maximumLength;
}
public CharBuffer(final File f) throws IOException {
// initially fill the buffer with the content of a file
if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering");
+ this.maximumLength = Integer.MAX_VALUE;
this.length = 0;
this.buffer = new char[(int) f.length()*2];
@@ -117,7 +124,7 @@ public final class CharBuffer extends Writer {
this.length = 0;
this.offset = 0;
}
-
+
public int length() {
return this.length;
}
@@ -135,19 +142,21 @@ public final class CharBuffer extends Writer {
public void write(final int b) {
write((char)b);
}
-
+
public void write(final char b) {
+ if (this.buffer.length > this.maximumLength) return;
if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1);
this.buffer[this.offset + this.length++] = b;
}
-
+
@Override
public void write(final char[] bb) {
write(bb, 0, bb.length);
}
-
+
@Override
public void write(final char[] bb, final int of, final int le) {
+ if (this.buffer.length > this.maximumLength) return;
if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le);
System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
this.length += le;
@@ -156,7 +165,7 @@ public final class CharBuffer extends Writer {
private static final char SPACE = ' ';
private static final char CR = (char) 13;
private static final char LF = (char) 10;
-
+
public CharBuffer appendSpace() {
write(SPACE);
return this;
@@ -189,29 +198,29 @@ public final class CharBuffer extends Writer {
@Override
public CharBuffer append(final char c) {
- write(c);
+ write(c);
return this;
}
-
+
public CharBuffer append(final String s) {
final char[] temp = new char[s.length()];
s.getChars(0, temp.length, temp, 0);
write(temp);
return this;
- }
-
+ }
+
public CharBuffer append(final String s, final int off, final int len) {
final char[] temp = new char[len];
s.getChars(off, (off + len), temp, 0);
write(temp);
return this;
}
-
+
public CharBuffer append(final CharBuffer bb) {
write(bb.buffer, bb.offset, bb.length);
return this;
}
-
+
public char charAt(final int pos) {
if (pos < 0) throw new IndexOutOfBoundsException();
if (pos > this.length) throw new IndexOutOfBoundsException();
@@ -227,7 +236,7 @@ public final class CharBuffer extends Writer {
System.arraycopy(this.buffer, this.offset + pos + 1, this.buffer, this.offset + pos, this.length - pos - 1);
}
}
-
+
public int indexOf(final char b) {
return indexOf(b, 0);
}
@@ -247,18 +256,18 @@ public final class CharBuffer extends Writer {
loop: for (int i = start; i <= this.length - bs.length; i++) {
// first test only first char
if (this.buffer[this.offset + i] != bs[0]) continue loop;
-
+
// then test all remaining char
for (int j = 1; j < bs.length; j++) {
if (this.buffer[this.offset + i + j] != bs[j]) continue loop;
}
-
+
// found hit
return i;
}
return -1;
}
-
+
public static int indexOf(final char[] b, final char c) {
return indexOf(b, 0, c);
}
@@ -267,7 +276,7 @@ public final class CharBuffer extends Writer {
for (int i = offset; i < b.length; i++) if (b[i] == c) return i;
return -1;
}
-
+
public static int indexOf(final char[] b, final char[] s) {
return indexOf(b, 0, s);
}
@@ -277,12 +286,12 @@ public final class CharBuffer extends Writer {
loop: for (int i = start; i <= b.length - bs.length; i++) {
// first test only first char
if (b[i] != bs[0]) continue loop;
-
+
// then test all remaining char
for (int j = 1; j < bs.length; j++) {
if (b[i + j] != bs[j]) continue loop;
}
-
+
// found hit
return i;
}
@@ -301,7 +310,7 @@ public final class CharBuffer extends Writer {
}
return true;
}
-
+
public char[] getChars() {
return getChars(0);
}
@@ -318,7 +327,7 @@ public final class CharBuffer extends Writer {
System.arraycopy(this.buffer, this.offset + start, tmp, 0, end - start);
return tmp;
}
-
+
public byte[] getBytes() {
return UTF8.getBytes(new String(this.buffer, this.offset, this.length));
}
@@ -363,7 +372,7 @@ public final class CharBuffer extends Writer {
}
return true;
}
-
+
public int whitespaceStart(final boolean includeNonLetterBytes) {
// returns number of whitespace char at the beginning of text
if (includeNonLetterBytes) {
@@ -377,7 +386,7 @@ public final class CharBuffer extends Writer {
}
return this.length;
}
-
+
public int whitespaceEnd(final boolean includeNonLetterBytes) {
// returns position of whitespace at the end of text
if (includeNonLetterBytes) {
@@ -391,8 +400,8 @@ public final class CharBuffer extends Writer {
}
return 0;
}
-
-
+
+
@Override
public String toString() {
return new String(this.buffer, this.offset, this.length);
@@ -453,11 +462,11 @@ public final class CharBuffer extends Writer {
}
return p;
}
-
+
public static boolean equals(final char[] buffer, final char[] pattern) {
return equals(buffer, 0, pattern);
}
-
+
public static boolean equals(final char[] buffer, final int offset, final char[] pattern) {
// compares two char arrays: true, if pattern appears completely at offset position
if (buffer.length < offset + pattern.length) return false;
@@ -468,20 +477,20 @@ public final class CharBuffer extends Writer {
public void reset() {
this.length = 0;
this.offset = 0;
- }
-
- public void reset(final int newSize) {
+ }
+
+ public void reset(final int newSize) {
this.resize(newSize);
this.reset();
- }
-
+ }
+
public void resize(final int newSize) {
if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize);
final char[] v = new char[newSize];
System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize);
- this.buffer = v;
+ this.buffer = v;
}
-
+
public char toCharArray()[] {
final char[] newbuf = new char[this.length];
System.arraycopy(this.buffer, 0, newbuf, 0, this.length);
@@ -490,12 +499,12 @@ public final class CharBuffer extends Writer {
@Override
public void close() throws IOException {
- this.buffer = null; // assist with garbage collection
+ this.buffer = null; // assist with garbage collection
}
@Override
public void flush() throws IOException {
- // TODO Auto-generated method stub
- }
-
+ // intentionally empty: write() stores chars directly in the in-memory array, so there is nothing to flush
+ }
+
}
\ No newline at end of file