Set a limit on the CharBuffer object size to guard against bad or overly large content
pull/1/head
Michael Peter Christen 13 years ago
parent c602eaaf46
commit b7bb84c0bb

@ -59,6 +59,7 @@ import net.yacy.kelondro.util.MemoryControl;
public class ContentScraper extends AbstractScraper implements Scraper {
private static final String EMPTY_STRING = new String();
public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "'".toCharArray();
@ -166,7 +167,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.bold = new ClusteredScoreMap<String>();
this.italic = new ClusteredScoreMap<String>();
this.li = new ArrayList<String>();
this.content = new CharBuffer(1024);
this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0f;
this.lat = 0.0f;

@ -82,7 +82,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
private static char[] genBlueLetters(int length) {
final CharBuffer bb = new CharBuffer(" <FONT COLOR=#0000FF>".toCharArray());
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, " <FONT COLOR=#0000FF>".toCharArray());
length = length / 2;
if (length > 10) length = 7;
while (length-- > 0) {

@ -96,7 +96,7 @@ public final class TransformerWriter extends Writer {
this.outStream = outStream;
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new CharBuffer(initialBufferSize);
this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
@ -114,7 +114,7 @@ public final class TransformerWriter extends Writer {
}
public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) {
final CharBuffer bb = new CharBuffer(tagname.length() + tagopts.length + 3);
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + tagopts.length + 3);
bb.append('<');
if (!opening) {
bb.append('/');
@ -136,7 +136,7 @@ public final class TransformerWriter extends Writer {
}
public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) {
final CharBuffer bb = new CharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, 2 * tagname.length() + tagopts.length + text.length + 5);
bb.append('<').append(tagname);
if (tagopts.length > 0) {
// if (tagopts[0] == (byte) 32)
@ -157,7 +157,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar);
final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
bb.append('<').append(tagname);
if (tagoptsx != null) {
bb.appendSpace();
@ -175,7 +175,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
final char[] gt0 = genTag0(tagname, tagopts, quotechar);
final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append('<').append('/').append(tagname).append('>');
final char[] result = cb.getChars();
try {
@ -189,7 +189,7 @@ public final class TransformerWriter extends Writer {
// a helper method for pretty-printing of properties for html tags
public static char[] genOpts(final Properties prop, final char quotechar) {
final Enumeration<?> e = prop.propertyNames();
final CharBuffer bb = new CharBuffer(prop.size() * 40);
final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, prop.size() * 40);
String key;
while (e.hasMoreElements()) {
key = (String) e.nextElement();
@ -225,7 +225,7 @@ public final class TransformerWriter extends Writer {
if (opening) {
if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(content);
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
@ -236,7 +236,7 @@ public final class TransformerWriter extends Writer {
}
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(content);
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
@ -250,14 +250,14 @@ public final class TransformerWriter extends Writer {
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
// ok, start collecting
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(content);
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
// we ignore that thing and return it again

@ -50,39 +50,41 @@ import net.yacy.kelondro.util.FileUtils;
public class odtParser extends AbstractParser implements Parser {
public final static int MAX_DOCSIZE = 200 * 1024 * 1024;
public odtParser() {
super("OASIS OpenDocument V2 Text Document Parser");
SUPPORTED_EXTENSIONS.add("odt");
SUPPORTED_EXTENSIONS.add("ods");
SUPPORTED_EXTENSIONS.add("odp");
SUPPORTED_EXTENSIONS.add("odg");
SUPPORTED_EXTENSIONS.add("odc");
SUPPORTED_EXTENSIONS.add("odf");
SUPPORTED_EXTENSIONS.add("odb");
SUPPORTED_EXTENSIONS.add("odi");
SUPPORTED_EXTENSIONS.add("odm");
SUPPORTED_EXTENSIONS.add("ott");
SUPPORTED_EXTENSIONS.add("ots");
SUPPORTED_EXTENSIONS.add("otp");
SUPPORTED_EXTENSIONS.add("otg");
SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format
SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.chart");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.formula");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.database");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.image");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-master");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-template");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet-template");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation-template");
SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics-template");
SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
SUPPORTED_MIME_TYPES.add("application/OOo-calc");
SUPPORTED_MIME_TYPES.add("application/OOo-writer");
this.SUPPORTED_EXTENSIONS.add("odt");
this.SUPPORTED_EXTENSIONS.add("ods");
this.SUPPORTED_EXTENSIONS.add("odp");
this.SUPPORTED_EXTENSIONS.add("odg");
this.SUPPORTED_EXTENSIONS.add("odc");
this.SUPPORTED_EXTENSIONS.add("odf");
this.SUPPORTED_EXTENSIONS.add("odb");
this.SUPPORTED_EXTENSIONS.add("odi");
this.SUPPORTED_EXTENSIONS.add("odm");
this.SUPPORTED_EXTENSIONS.add("ott");
this.SUPPORTED_EXTENSIONS.add("ots");
this.SUPPORTED_EXTENSIONS.add("otp");
this.SUPPORTED_EXTENSIONS.add("otg");
this.SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format
this.SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.chart");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.formula");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.database");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.image");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-master");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-template");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet-template");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation-template");
this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics-template");
this.SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text");
this.SUPPORTED_MIME_TYPES.add("application/OOo-calc");
this.SUPPORTED_MIME_TYPES.add("application/OOo-writer");
}
private Document[] parse(final MultiProtocolURI location, final String mimeType,
@ -114,7 +116,7 @@ public class odtParser extends AbstractParser implements Parser {
if (entryName.equals("content.xml")) {
// create a writer for output
writer = new CharBuffer((int)zipEntry.getSize());
writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@ -191,6 +193,7 @@ public class odtParser extends AbstractParser implements Parser {
}
}
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {

@ -55,20 +55,20 @@ public class ooxmlParser extends AbstractParser implements Parser {
public ooxmlParser() {
super("Open Office XML Document Parser");
SUPPORTED_EXTENSIONS.add("docx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
SUPPORTED_EXTENSIONS.add("dotx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template");
SUPPORTED_EXTENSIONS.add("potx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template");
SUPPORTED_EXTENSIONS.add("ppsx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow");
SUPPORTED_EXTENSIONS.add("pptx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
SUPPORTED_EXTENSIONS.add("xlsx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
SUPPORTED_EXTENSIONS.add("xltx");
SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template");
this.SUPPORTED_EXTENSIONS.add("docx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
this.SUPPORTED_EXTENSIONS.add("dotx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template");
this.SUPPORTED_EXTENSIONS.add("potx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template");
this.SUPPORTED_EXTENSIONS.add("ppsx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow");
this.SUPPORTED_EXTENSIONS.add("pptx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
this.SUPPORTED_EXTENSIONS.add("xlsx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
this.SUPPORTED_EXTENSIONS.add("xltx");
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template");
}
private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException {
@ -100,7 +100,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
|| entryName.startsWith("xl/worksheets/sheet")) {
// create a writer for output
writer = new CharBuffer((int)zipEntry.getSize());
writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@ -180,6 +180,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
}
}
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {

@ -67,6 +67,7 @@ public class pdfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
}
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
@ -125,12 +126,13 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null || docTitle.length() == 0) {
docTitle = MultiProtocolURI.unescape(location.getFileName());
}
final CharBuffer writer = new CharBuffer();
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
try {
// create a writer for output
final PDFTextStripper stripper = new PDFTextStripper();
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final Thread t = new Thread() {
@Override
public void run() {
try {
stripper.writeText(pdfDoc, writer); // may throw a NPE

@ -188,7 +188,7 @@ public class URIMetadataRow implements URIMetadata {
final String dc_publisher,
final float lat,
final float lon) {
final CharBuffer s = new CharBuffer(360);
final CharBuffer s = new CharBuffer(20000, 360);
s.append(url.toNormalform(false, true)).appendLF();
s.append(dc_title).appendLF();
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);

@ -42,34 +42,38 @@ public final class CharBuffer extends Writer {
private char[] buffer;
private int offset;
private int length;
private final int maximumLength;
public CharBuffer() {
public CharBuffer(final int maximumLength) {
this.buffer = new char[10];
this.length = 0;
this.offset = 0;
this.maximumLength = maximumLength;
}
public CharBuffer(final int initLength) {
public CharBuffer(final int maximumLength, final int initLength) {
this.buffer = new char[initLength];
this.length = 0;
this.offset = 0;
this.maximumLength = maximumLength;
}
public CharBuffer(final char[] bb) {
public CharBuffer(final int maximumLength, final char[] bb) {
this.buffer = bb;
this.length = bb.length;
this.offset = 0;
this.maximumLength = maximumLength;
}
public CharBuffer(final char[] bb, final int initLength) {
public CharBuffer(final int maximumLength, final char[] bb, final int initLength) {
this.buffer = new char[initLength];
System.arraycopy(bb, 0, this.buffer, 0, bb.length);
this.length = bb.length;
this.offset = 0;
this.maximumLength = maximumLength;
}
public CharBuffer(final char[] bb, final int of, final int le) {
public CharBuffer(final int maximumLength, final char[] bb, final int of, final int le) {
if (of * 2 > bb.length) {
this.buffer = new char[le];
System.arraycopy(bb, of, this.buffer, 0, le);
@ -80,17 +84,20 @@ public final class CharBuffer extends Writer {
this.length = le;
this.offset = of;
}
this.maximumLength = maximumLength;
}
/**
 * Creates a buffer that takes over the state of another CharBuffer.
 * The backing array reference is assigned, not cloned, so after construction
 * both instances refer to the same char[]; offset, length and the maximum
 * length limit are taken over unchanged.
 *
 * @param bb the buffer whose fields are adopted
 */
public CharBuffer(final CharBuffer bb) {
this.buffer = bb.buffer; // same array instance as bb, no copy is made
this.length = bb.length;
this.offset = bb.offset;
this.maximumLength = bb.maximumLength; // the source buffer's limit carries over
}
public CharBuffer(final File f) throws IOException {
// initially fill the buffer with the content of a file
if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering");
this.maximumLength = Integer.MAX_VALUE;
this.length = 0;
this.buffer = new char[(int) f.length()*2];
@ -137,6 +144,7 @@ public final class CharBuffer extends Writer {
}
/**
 * Appends a single character behind the currently stored content.
 * The character is discarded without any error when the backing array is
 * already larger than maximumLength.
 * NOTE(review): the guard compares the array capacity (buffer.length), not the
 * number of stored characters (length) - confirm this is the intended limit.
 *
 * @param b the character to append
 */
public void write(final char b) {
// size limit: silently drop the write once the backing array exceeds the maximum
if (this.buffer.length > this.maximumLength) return;
// the next slot would lie beyond the array end: call grow with the required size
// (presumably enlarges the backing array; grow is defined elsewhere - verify)
if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1);
// store at the first free position and advance the content length by one
this.buffer[this.offset + this.length++] = b;
}
@ -148,6 +156,7 @@ public final class CharBuffer extends Writer {
@Override
public void write(final char[] bb, final int of, final int le) {
if (this.buffer.length > this.maximumLength) return;
if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le);
System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
this.length += le;

Loading…
Cancel
Save