diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 0a772f27b..07b22f6ab 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -59,7 +59,8 @@ import net.yacy.kelondro.util.MemoryControl; public class ContentScraper extends AbstractScraper implements Scraper { private static final String EMPTY_STRING = new String(); - + public static final int MAX_DOCSIZE = 40 * 1024 * 1024; + private final char degree = '\u00B0'; private final char[] minuteCharsHTML = "'".toCharArray(); @@ -166,7 +167,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.bold = new ClusteredScoreMap(); this.italic = new ClusteredScoreMap(); this.li = new ArrayList(); - this.content = new CharBuffer(1024); + this.content = new CharBuffer(MAX_DOCSIZE, 1024); this.htmlFilterEventListeners = new EventListenerList(); this.lon = 0.0f; this.lat = 0.0f; diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java index 1858613e3..c6d97bea4 100644 --- a/source/net/yacy/document/parser/html/ContentTransformer.java +++ b/source/net/yacy/document/parser/html/ContentTransformer.java @@ -37,7 +37,7 @@ import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; public class ContentTransformer extends AbstractTransformer implements Transformer { - + // statics: for initialization of the HTMLFilterAbstractTransformer private static final TreeSet linkTags0 = new TreeSet(ASCII.insensitiveASCIIComparator); private static final TreeSet linkTags1 = new TreeSet(ASCII.insensitiveASCIIComparator); @@ -82,7 +82,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform } private static char[] genBlueLetters(int length) { - final CharBuffer bb = new CharBuffer(" ".toCharArray()); + final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, " ".toCharArray()); length = length / 2; if (length > 10) length = 7; while (length-- > 0) { @@ -106,7 +106,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform } return false; } - + @Override public char[] transformText(final char[] text) { if (this.bluelist != null) { diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 6670aa27d..e6dfe9c75 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -96,7 +96,7 @@ public final class TransformerWriter extends Writer { this.outStream = outStream; this.scraper = scraper; this.transformer = transformer; - this.buffer = new CharBuffer(initialBufferSize); + this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize); this.filterTag = null; this.filterOpts = null; this.filterCont = null; @@ -114,7 +114,7 @@ public final class TransformerWriter extends Writer { } public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) { - final CharBuffer bb = new CharBuffer(tagname.length() + tagopts.length + 3); + final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + tagopts.length + 3); bb.append('<'); if (!opening) { bb.append('/'); @@ -136,7 +136,7 @@ public final class TransformerWriter extends Writer { } public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) { - final CharBuffer bb = new CharBuffer(2 * tagname.length() + tagopts.length + text.length + 5); + final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, 2 * tagname.length() + tagopts.length + text.length + 5); bb.append('<').append(tagname); if (tagopts.length > 0) { // if (tagopts[0] == (byte) 32) @@ -157,7 +157,7 @@ public final class TransformerWriter extends Writer { public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) { final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar); - final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); + final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); bb.append('<').append(tagname); if (tagoptsx != null) { bb.appendSpace(); @@ -175,7 +175,7 @@ public final class TransformerWriter extends Writer { public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) { final char[] gt0 = genTag0(tagname, tagopts, quotechar); - final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3); + final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3); cb.append(text).append('<').append('/').append(tagname).append('>'); final char[] result = cb.getChars(); try { @@ -189,7 +189,7 @@ public final class TransformerWriter extends Writer { // a helper method for pretty-printing of properties for html tags public static char[] genOpts(final Properties prop, final char quotechar) { final Enumeration e = prop.propertyNames(); - final CharBuffer bb = new CharBuffer(prop.size() * 40); + final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, prop.size() * 40); String key; while (e.hasMoreElements()) { key = (String) e.nextElement(); @@ -225,7 +225,7 @@ public final class TransformerWriter extends Writer { if (opening) { if ((this.scraper != null) && (this.scraper.isTag0(tag))) { // this single tag is collected at once here - final CharBuffer charBuffer = new CharBuffer(content); + final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); this.scraper.scrapeTag0(tag, charBuffer.propParser()); try { charBuffer.close(); @@ -236,7 +236,7 @@ public final class TransformerWriter extends Writer { } if ((this.transformer != null) && (this.transformer.isTag0(tag))) { // this single tag is collected at once here - final CharBuffer scb = new CharBuffer(content); + final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); try { return this.transformer.transformTag0(tag, scb.propParser(), quotechar); } finally { @@ -250,14 +250,14 @@ public final class TransformerWriter extends Writer { ((this.transformer != null) && (this.transformer.isTag1(tag)))) { // ok, start collecting this.filterTag = tag; - final CharBuffer scb = new CharBuffer(content); + final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); this.filterOpts = scb.propParser(); try { scb.close(); } catch (final IOException e) { Log.logException(e); } - if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset(); + if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset(); return new char[0]; } else { // we ignore that thing and return it again diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index b0b45d0e2..b86fb7b30 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -1,4 +1,4 @@ -//odtParser.java +//odtParser.java //------------------------ //part of YaCy //(C) by Michael Peter Christen; mc@yacy.net @@ -50,71 +50,73 @@ import net.yacy.kelondro.util.FileUtils; public class odtParser extends AbstractParser implements Parser { - public odtParser() { + public final static int MAX_DOCSIZE = 200 * 1024 * 1024; + + public odtParser() { super("OASIS OpenDocument V2 Text Document Parser"); - SUPPORTED_EXTENSIONS.add("odt"); - SUPPORTED_EXTENSIONS.add("ods"); - SUPPORTED_EXTENSIONS.add("odp"); - SUPPORTED_EXTENSIONS.add("odg"); - SUPPORTED_EXTENSIONS.add("odc"); - SUPPORTED_EXTENSIONS.add("odf"); - SUPPORTED_EXTENSIONS.add("odb"); - SUPPORTED_EXTENSIONS.add("odi"); - SUPPORTED_EXTENSIONS.add("odm"); - SUPPORTED_EXTENSIONS.add("ott"); - SUPPORTED_EXTENSIONS.add("ots"); - SUPPORTED_EXTENSIONS.add("otp"); - SUPPORTED_EXTENSIONS.add("otg"); - SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format - SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.chart"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.formula"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.database"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.image"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-master"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-template"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet-template"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation-template"); - SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics-template"); - SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text"); - SUPPORTED_MIME_TYPES.add("application/OOo-calc"); - SUPPORTED_MIME_TYPES.add("application/OOo-writer"); + this.SUPPORTED_EXTENSIONS.add("odt"); + this.SUPPORTED_EXTENSIONS.add("ods"); + this.SUPPORTED_EXTENSIONS.add("odp"); + this.SUPPORTED_EXTENSIONS.add("odg"); + this.SUPPORTED_EXTENSIONS.add("odc"); + this.SUPPORTED_EXTENSIONS.add("odf"); + this.SUPPORTED_EXTENSIONS.add("odb"); + this.SUPPORTED_EXTENSIONS.add("odi"); + this.SUPPORTED_EXTENSIONS.add("odm"); + this.SUPPORTED_EXTENSIONS.add("ott"); + this.SUPPORTED_EXTENSIONS.add("ots"); + this.SUPPORTED_EXTENSIONS.add("otp"); + this.SUPPORTED_EXTENSIONS.add("otg"); + this.SUPPORTED_EXTENSIONS.add("sxw"); // Star Office Writer file format + this.SUPPORTED_EXTENSIONS.add("sxc"); // Star Office Calc file format + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.chart"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.formula"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.database"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.image"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-master"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text-template"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet-template"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation-template"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.graphics-template"); + this.SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text"); + this.SUPPORTED_MIME_TYPES.add("application/OOo-calc"); + this.SUPPORTED_MIME_TYPES.add("application/OOo-writer"); } - + private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException { - + CharBuffer writer = null; - try { + try { String docDescription = null; String docKeywordStr = null; String docShortTitle = null; String docLongTitle = null; String docAuthor = null; String docLanguage = null; - + // opening the file as zip file final ZipFile zipFile = new ZipFile(dest); final Enumeration zipEnum = zipFile.entries(); final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); - + // looping through all containing files while (zipEnum.hasMoreElements()) { - + // getting the next zip file entry final ZipEntry zipEntry= zipEnum.nextElement(); final String entryName = zipEntry.getName(); - + // content.xml contains the document content in xml format if (entryName.equals("content.xml")) { - + // create a writer for output - writer = new CharBuffer((int)zipEntry.getSize()); + writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize()); try { // extract data final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); @@ -142,22 +144,22 @@ public class odtParser extends AbstractParser implements Parser { docLanguage = metaData.getLanguage(); } } - + // make the languages set final Set languages = new HashSet(1); if (docLanguage != null) languages.add(docLanguage); - + // if there is no title availabe we generate one if (docLongTitle == null) { if (docShortTitle != null) { docLongTitle = docShortTitle; - } - } - + } + } + // split the keywords String[] docKeywords = null; if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); - + // create the parser document Document[] docs = null; final byte[] contentBytes = UTF8.getBytes(writer.toString()); @@ -173,41 +175,42 @@ public class odtParser extends AbstractParser implements Parser { "", null, docDescription, - 0.0f, 0.0f, + 0.0f, 0.0f, contentBytes, null, null, null, false)}; return docs; - } catch (final Exception e) { + } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; - + // close the writer if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */} - - throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); + + throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); } } - + + @Override public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile dest = File.createTempFile("OpenDocument", ".odt"); dest.deleteOnExit(); - + // copying the stream into a file FileUtils.copy(source, dest); - + // parsing the content return parse(location, mimeType, charset, dest); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; - - throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); + + throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); } finally { if (dest != null) FileUtils.deletedelete(dest); } diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 195b3f7f7..fdc9192ff 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -1,4 +1,4 @@ -//odtParser.java +//odtParser.java //------------------------ //part of YaCy //(C) by Michael Peter Christen; mc@yacy.net @@ -53,61 +53,61 @@ import net.yacy.kelondro.util.FileUtils; public class ooxmlParser extends AbstractParser implements Parser { - public ooxmlParser() { - super("Open Office XML Document Parser"); - SUPPORTED_EXTENSIONS.add("docx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); - SUPPORTED_EXTENSIONS.add("dotx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template"); - SUPPORTED_EXTENSIONS.add("potx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template"); - SUPPORTED_EXTENSIONS.add("ppsx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow"); - SUPPORTED_EXTENSIONS.add("pptx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation"); - SUPPORTED_EXTENSIONS.add("xlsx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - SUPPORTED_EXTENSIONS.add("xltx"); - SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template"); + public ooxmlParser() { + super("Open Office XML Document Parser"); + this.SUPPORTED_EXTENSIONS.add("docx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + this.SUPPORTED_EXTENSIONS.add("dotx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template"); + this.SUPPORTED_EXTENSIONS.add("potx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template"); + this.SUPPORTED_EXTENSIONS.add("ppsx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow"); + this.SUPPORTED_EXTENSIONS.add("pptx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation"); + this.SUPPORTED_EXTENSIONS.add("xlsx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + this.SUPPORTED_EXTENSIONS.add("xltx"); + this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template"); } - + private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException { - + CharBuffer writer = null; - try { + try { String docDescription = null; String docKeywordStr = null; String docShortTitle = null; String docLongTitle = null; String docAuthor = null; String docLanguage = null; - + // opening the file as zip file final ZipFile zipFile= new ZipFile(dest); final Enumeration zipEnum = zipFile.entries(); final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); - + // looping through all containing files while (zipEnum.hasMoreElements()) { - + // get next zip file entry final ZipEntry zipEntry= zipEnum.nextElement(); final String entryName = zipEntry.getName(); - + // content.xml contains the document content in xml format if (entryName.equals("word/document.xml") || entryName.startsWith("ppt/slides/slide") || entryName.startsWith("xl/worksheets/sheet")) { - + // create a writer for output - writer = new CharBuffer((int)zipEntry.getSize()); + writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize()); try { // extract data final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); try { final SAXParser saxParser = saxParserFactory.newSAXParser(); saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); - + // close readers and writers } finally { zipFileEntryStream.close(); @@ -129,21 +129,21 @@ public class ooxmlParser extends AbstractParser implements Parser { docLanguage = metaData.getLanguage(); } } - + // make the languages set final Set languages = new HashSet(1); if (docLanguage != null && docLanguage.length() == 0) languages.add(docLanguage); - + // if there is no title availabe we generate one if ((docLongTitle == null || docLongTitle.length() == 0) && (docShortTitle != null)) { docLongTitle = docShortTitle; - } - + } + // split the keywords String[] docKeywords = null; if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); - + // create the parser document Document[] docs = null; final byte[] contentBytes = UTF8.getBytes(writer.toString()); @@ -159,44 +159,45 @@ public class ooxmlParser extends AbstractParser implements Parser { "", null, docDescription, - 0.0f, 0.0f, + 0.0f, 0.0f, contentBytes, null, null, null, false)}; return docs; - } catch (final Exception e) { + } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; - + // close the writer if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */} Log.logException(e); - throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); + throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); } } - + + @Override public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile dest = File.createTempFile("OpenDocument", ".odt"); dest.deleteOnExit(); - + // copying the stream into a file FileUtils.copy(source, dest); - + // parsing the content return parse(location, mimeType, charset, dest); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; - - throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); + + throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); } finally { if (dest != null) FileUtils.deletedelete(dest); } diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 2d5dd75c7..11db922fc 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -67,6 +67,7 @@ public class pdfParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("text/x-pdf"); } + @Override public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser @@ -125,12 +126,13 @@ public class pdfParser extends AbstractParser implements Parser { if (docTitle == null || docTitle.length() == 0) { docTitle = MultiProtocolURI.unescape(location.getFileName()); } - final CharBuffer writer = new CharBuffer(); + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); try { // create a writer for output final PDFTextStripper stripper = new PDFTextStripper(); // we start the pdf parsing in a separate thread to ensure that it can be terminated final Thread t = new Thread() { + @Override public void run() { try { stripper.writeText(pdfDoc, writer); // may throw a NPE diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 1022a7c51..8819aa07e 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -188,7 +188,7 @@ public class URIMetadataRow implements URIMetadata { final String dc_publisher, final float lat, final float lon) { - final CharBuffer s = new CharBuffer(360); + final CharBuffer s = new CharBuffer(20000, 360); s.append(url.toNormalform(false, true)).appendLF(); s.append(dc_title).appendLF(); if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); @@ -381,23 +381,23 @@ public class URIMetadataRow implements URIMetadata { public boolean matches(final Pattern matcher) { return this.metadata().matches(matcher); } - + public DigestURI url() { return this.metadata().url(); } - + public String dc_title() { return this.metadata().dc_title(); } - + public String dc_creator() { return this.metadata().dc_creator(); } - + public String dc_publisher() { return this.metadata().dc_publisher(); } - + public String dc_subject() { return this.metadata().dc_subject(); } @@ -409,7 +409,7 @@ public class URIMetadataRow implements URIMetadata { public float lon() { return this.metadata().lon(); } - + private Components metadata() { // avoid double computation of metadata elements if (this.comp != null) return this.comp; diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java index 70fe1b3f3..85d5767ee 100644 --- a/source/net/yacy/kelondro/io/CharBuffer.java +++ b/source/net/yacy/kelondro/io/CharBuffer.java @@ -1,4 +1,4 @@ -// serverCharBuffer.java +// serverCharBuffer.java // --------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -34,42 +34,46 @@ import java.util.Properties; import net.yacy.cora.document.UTF8; public final class CharBuffer extends Writer { - + public static final char singlequote = '\''; public static final char doublequote = '"'; public static final char equal = '='; - + private char[] buffer; private int offset; private int length; + private final int maximumLength; - - public CharBuffer() { + public CharBuffer(final int maximumLength) { this.buffer = new char[10]; this.length = 0; this.offset = 0; + this.maximumLength = maximumLength; } - - public CharBuffer(final int initLength) { + + public CharBuffer(final int maximumLength, final int initLength) { this.buffer = new char[initLength]; this.length = 0; this.offset = 0; - } - - public CharBuffer(final char[] bb) { + this.maximumLength = maximumLength; + } + + public CharBuffer(final int maximumLength, final char[] bb) { this.buffer = bb; this.length = bb.length; this.offset = 0; + this.maximumLength = maximumLength; } - public CharBuffer(final char[] bb, final int initLength) { + public CharBuffer(final int maximumLength, final char[] bb, final int initLength) { this.buffer = new char[initLength]; System.arraycopy(bb, 0, this.buffer, 0, bb.length); this.length = bb.length; this.offset = 0; + this.maximumLength = maximumLength; } - - public CharBuffer(final char[] bb, final int of, final int le) { + + public CharBuffer(final int maximumLength, final char[] bb, final int of, final int le) { if (of * 2 > bb.length) { this.buffer = new char[le]; System.arraycopy(bb, of, this.buffer, 0, le); @@ -80,17 +84,20 @@ public final class CharBuffer extends Writer { this.length = le; this.offset = of; } + this.maximumLength = maximumLength; } public CharBuffer(final CharBuffer bb) { this.buffer = bb.buffer; this.length = bb.length; this.offset = bb.offset; + this.maximumLength = bb.maximumLength; } public CharBuffer(final File f) throws IOException { // initially fill the buffer with the content of a file if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering"); + this.maximumLength = Integer.MAX_VALUE; this.length = 0; this.buffer = new char[(int) f.length()*2]; @@ -117,7 +124,7 @@ public final class CharBuffer extends Writer { this.length = 0; this.offset = 0; } - + public int length() { return this.length; } @@ -135,19 +142,21 @@ public final class CharBuffer extends Writer { public void write(final int b) { write((char)b); } - + public void write(final char b) { + if (this.buffer.length > this.maximumLength) return; if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1); this.buffer[this.offset + this.length++] = b; } - + @Override public void write(final char[] bb) { write(bb, 0, bb.length); } - + @Override public void write(final char[] bb, final int of, final int le) { + if (this.buffer.length > this.maximumLength) return; if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le); System.arraycopy(bb, of, this.buffer, this.offset + this.length, le); this.length += le; @@ -156,7 +165,7 @@ public final class CharBuffer extends Writer { private static final char SPACE = ' '; private static final char CR = (char) 13; private static final char LF = (char) 10; - + public CharBuffer appendSpace() { write(SPACE); return this; @@ -189,29 +198,29 @@ public final class CharBuffer extends Writer { @Override public CharBuffer append(final char c) { - write(c); + write(c); return this; } - + public CharBuffer append(final String s) { final char[] temp = new char[s.length()]; s.getChars(0, temp.length, temp, 0); write(temp); return this; - } - + } + public CharBuffer append(final String s, final int off, final int len) { final char[] temp = new char[len]; s.getChars(off, (off + len), temp, 0); write(temp); return this; } - + public CharBuffer append(final CharBuffer bb) { write(bb.buffer, bb.offset, bb.length); return this; } - + public char charAt(final int pos) { if (pos < 0) throw new IndexOutOfBoundsException(); if (pos > this.length) throw new IndexOutOfBoundsException(); @@ -227,7 +236,7 @@ public final class CharBuffer extends Writer { System.arraycopy(this.buffer, this.offset + pos + 1, this.buffer, this.offset + pos, this.length - pos - 1); } } - + public int indexOf(final char b) { return indexOf(b, 0); } @@ -247,18 +256,18 @@ public final class CharBuffer extends Writer { loop: for (int i = start; i <= this.length - bs.length; i++) { // first test only first char if (this.buffer[this.offset + i] != bs[0]) continue loop; - + // then test all remaining char for (int j = 1; j < bs.length; j++) { if (this.buffer[this.offset + i + j] != bs[j]) continue loop; } - + // found hit return i; } return -1; } - + public static int indexOf(final char[] b, final char c) { return indexOf(b, 0, c); } @@ -267,7 +276,7 @@ public final class CharBuffer extends Writer { for (int i = offset; i < b.length; i++) if (b[i] == c) return i; return -1; } - + public static int indexOf(final char[] b, final char[] s) { return indexOf(b, 0, s); } @@ -277,12 +286,12 @@ public final class CharBuffer extends Writer { loop: for (int i = start; i <= b.length - bs.length; i++) { // first test only first char if (b[i] != bs[0]) continue loop; - + // then test all remaining char for (int j = 1; j < bs.length; j++) { if (b[i + j] != bs[j]) continue loop; } - + // found hit return i; } @@ -301,7 +310,7 @@ public final class CharBuffer extends Writer { } return true; } - + public char[] getChars() { return getChars(0); } @@ -318,7 +327,7 @@ public final class CharBuffer extends Writer { System.arraycopy(this.buffer, this.offset + start, tmp, 0, end - start); return tmp; } - + public byte[] getBytes() { return UTF8.getBytes(new String(this.buffer, this.offset, this.length)); } @@ -363,7 +372,7 @@ public final class CharBuffer extends Writer { } return true; } - + public int whitespaceStart(final boolean includeNonLetterBytes) { // returns number of whitespace char at the beginning of text if (includeNonLetterBytes) { @@ -377,7 +386,7 @@ public final class CharBuffer extends Writer { } return this.length; } - + public int whitespaceEnd(final boolean includeNonLetterBytes) { // returns position of whitespace at the end of text if (includeNonLetterBytes) { @@ -391,8 +400,8 @@ public final class CharBuffer extends Writer { } return 0; } - - + + @Override public String toString() { return new String(this.buffer, this.offset, this.length); @@ -453,11 +462,11 @@ public final class CharBuffer extends Writer { } return p; } - + public static boolean equals(final char[] buffer, final char[] pattern) { return equals(buffer, 0, pattern); } - + public static boolean equals(final char[] buffer, final int offset, final char[] pattern) { // compares two char arrays: true, if pattern appears completely at offset position if (buffer.length < offset + pattern.length) return false; @@ -468,20 +477,20 @@ public final class CharBuffer extends Writer { public void reset() { this.length = 0; this.offset = 0; - } - - public void reset(final int newSize) { + } + + public void reset(final int newSize) { this.resize(newSize); this.reset(); - } - + } + public void resize(final int newSize) { if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize); final char[] v = new char[newSize]; System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize); - this.buffer = v; + this.buffer = v; } - + public char toCharArray()[] { final char[] newbuf = new char[this.length]; System.arraycopy(this.buffer, 0, newbuf, 0, this.length); @@ -490,12 +499,12 @@ public final class CharBuffer extends Writer { @Override public void close() throws IOException { - this.buffer = null; // assist with garbage collection + this.buffer = null; // assist with garbage collection } @Override public void flush() throws IOException { - // TODO Auto-generated method stub - } - + // TODO Auto-generated method stub + } + } \ No newline at end of file