memory hacks

pull/1/head
Michael Peter Christen 13 years ago
parent b4409cc803
commit 4540174fe0

@@ -143,7 +143,7 @@ public class BookmarkHelper {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer= new TransformerWriter(null,null,scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);
writer.close();
links = scraper.getAnchors();

@@ -1039,18 +1039,18 @@ public final class HTTPDFileHandler {
if (mimeType.startsWith("text")) {
// every text-file distributed by yacy is UTF-8
if(!path.startsWith("/repository")) {
if (!path.startsWith("/repository")) {
mimeType = mimeType + "; charset=UTF-8";
} else {
// detect charset of html-files
if((path.endsWith("html") || path.endsWith("htm"))) {
if ((path.endsWith("html") || path.endsWith("htm"))) {
// save position
fis.mark(1000);
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false);
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
if(charset != null)
mimeType = mimeType + "; charset="+charset;
htmlFilter.close();
if (charset != null) mimeType = mimeType + "; charset="+charset;
// reset position
fis.reset();
}
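
The hunk above peeks at the start of the response stream (fis.mark, scrape, fis.reset) to discover a declared charset before the file is served from its beginning. A minimal standalone sketch of that mark/peek/reset idea, using only the JDK and a hypothetical CharsetPeekDemo class rather than YaCy's ScraperInputStream:

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CharsetPeekDemo {
    private static final Pattern CHARSET = Pattern.compile("charset=[\"']?([A-Za-z0-9_\\-]+)", Pattern.CASE_INSENSITIVE);

    // look ahead in a mark-supporting stream for a declared charset, then rewind
    static String detectCharset(InputStream in) throws IOException {
        in.mark(1000);                          // remember the current position
        byte[] head = new byte[1000];
        int n = in.read(head);
        in.reset();                             // rewind: the caller still reads the whole file
        if (n <= 0) return null;
        Matcher m = CHARSET.matcher(new String(head, 0, n, StandardCharsets.US_ASCII));
        return m.find() ? m.group(1) : null;
    }

    public static void main(String[] args) throws IOException {
        byte[] page = "<html><head><meta charset=\"ISO-8859-1\"></head></html>".getBytes(StandardCharsets.US_ASCII);
        InputStream fis = new BufferedInputStream(new ByteArrayInputStream(page));
        System.out.println(detectCharset(fis)); // prints ISO-8859-1
        // fis can now be read from the start again, as in the handler above
    }
}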

@@ -485,17 +485,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
writer.close();
} catch (final IOException e) {
Log.logException(e);
return cleanLine(super.stripAll(inlineHtml));
} finally {
scraper.close();
try {
writer.close();
} catch (IOException e) {
}
}
for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
mergeAnchors(entry.getKey(), entry.getValue());
}
this.images.putAll(scraper.images);
return cleanLine(super.stripAll(scraper.content.getChars()));
String line = cleanLine(super.stripAll(scraper.content.getChars()));
scraper.close();
return line;
}
private final static String cleanLine(final String s) {
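
The reworked block above closes the TransformerWriter in a finally clause, so its buffers are released on every path, including the early return taken when copying the inline HTML fails. A minimal standalone sketch of that close-in-finally pattern, assuming nothing about the YaCy classes (CloseInFinallyDemo is hypothetical):

import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;

public class CloseInFinallyDemo {
    static String render(char[] inlineHtml) {
        final Writer writer = new StringWriter();
        try {
            writer.write(inlineHtml);          // stands in for FileUtils.copy(...)
        } catch (final IOException e) {
            return new String(inlineHtml);     // fall back to the raw input
        } finally {
            try {
                writer.close();                // always release the writer, even on the early return
            } catch (final IOException e) {
                // close failures are deliberately ignored here
            }
        }
        return writer.toString();
    }

    public static void main(String[] args) {
        System.out.println(render("<b>hello</b>".toCharArray()));
    }
}
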
@@ -885,14 +892,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
if(charset == null)
charset = Charset.defaultCharset().toString();
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();
return scraper;
}

@@ -34,7 +34,6 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
public class ContentTransformer extends AbstractTransformer implements Transformer {
@@ -90,11 +89,7 @@ public class ContentTransformer extends AbstractTransformer implements Transformer {
}
bb.append("</FONT> ");
final char[] result = bb.getChars();
try {
bb.close();
} catch (IOException e) {
Log.logException(e);
}
bb.close();
return result;
}

@@ -101,6 +101,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
return null;
}
@Override
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname == null || tagname.length() == 0) return;
@@ -123,6 +124,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
}
}
@Override
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
if (tagname == null || tagname.length() == 0) return;
@@ -155,6 +157,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
return (this.charsetChanged) ? this.detectedCharset : null;
}
@Override
public int read() throws IOException {
// mode 0 is called from within the detectCharset function
if (this.mode == MODE_PRESCAN) {
@@ -166,5 +169,9 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
return this.bufferedIn.read();
}
@Override
public void close() throws IOException {
if (this.writer != null) this.writer.close();
}
}

@@ -127,11 +127,7 @@ public final class TransformerWriter extends Writer {
}
bb.append('>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
return result;
}
@@ -147,11 +143,7 @@ public final class TransformerWriter extends Writer {
bb.append(text);
bb.append('<').append('/').append(tagname).append('>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
return result;
}
@@ -165,11 +157,7 @@ public final class TransformerWriter extends Writer {
}
bb.append('>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
return result;
}
@@ -178,11 +166,7 @@ public final class TransformerWriter extends Writer {
final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append('<').append('/').append(tagname).append('>');
final char[] result = cb.getChars();
try {
cb.close();
} catch (final IOException e) {
Log.logException(e);
}
cb.close();
return result;
}
@@ -202,11 +186,7 @@ public final class TransformerWriter extends Writer {
result = bb.getChars(1);
else
result = bb.getChars();
try {
bb.close();
} catch (final IOException ex) {
Log.logException(ex);
}
bb.close();
return result;
}
@@ -227,12 +207,7 @@ public final class TransformerWriter extends Writer {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
charBuffer.close();
}
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
@@ -240,11 +215,7 @@ public final class TransformerWriter extends Writer {
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
scb.close();
}
} else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
@@ -252,11 +223,7 @@ public final class TransformerWriter extends Writer {
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
scb.close();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
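
All of the TransformerWriter and ContentTransformer hunks above collapse a try/catch around bb.close() or scb.close() into a bare call. That is possible because an overriding method may declare fewer checked exceptions than the method it overrides, and CharBuffer.close() (see the CharBuffer hunk further down) drops the IOException that Writer.close() declares. A standalone sketch of that language rule, using a hypothetical QuietBuffer class:

import java.io.Writer;

public class NarrowedThrowsDemo {
    static class QuietBuffer extends Writer {
        private final StringBuilder sb = new StringBuilder();
        @Override public void write(char[] cbuf, int off, int len) { sb.append(cbuf, off, len); }
        @Override public void flush() { }
        @Override public void close() { sb.setLength(0); }   // narrower than Writer.close() throws IOException
    }

    public static void main(String[] args) {
        QuietBuffer bb = new QuietBuffer();
        bb.write("hello".toCharArray(), 0, 5);
        bb.close();   // no try/catch needed: the static type QuietBuffer declares no checked exception
    }
}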

@@ -151,7 +151,6 @@ public class pdfParser extends AbstractParser implements Parser {
if (t.isAlive()) t.interrupt();
pdfDoc.close();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close();
} catch (final IOException e) {
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
@@ -166,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser {
//throw new Parser.Failure(e.getMessage(), location);
} finally {
try {pdfDoc.close();} catch (final IOException e) {}
writer.close();
}
String[] docKeywords = null;

@@ -189,7 +189,7 @@ public class URIMetadataRow implements URIMetadata {
final String dc_publisher,
final float lat,
final float lon) {
final CharBuffer s = new CharBuffer(20000, 360);
final CharBuffer s = new CharBuffer(3600, 360);
s.append(url.toNormalform(false, true)).appendLF();
s.append(dc_title).appendLF();
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);

@@ -130,7 +130,7 @@ public final class CharBuffer extends Writer {
}
private void grow(int minSize) {
int newsize = 2 * Math.max(this.buffer.length, minSize);
int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20%
char[] tmp = new char[newsize];
System.arraycopy(this.buffer, this.offset, tmp, 0, this.length);
this.buffer = tmp;
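
The growth step above changes from doubling to roughly 20% above the required minimum. A standalone sketch (hypothetical GrowthPolicyDemo, not YaCy code) that simulates repeated small appends under both policies and shows how much the final allocation overshoots the content that is actually stored:

public class GrowthPolicyDemo {
    static void simulate(String name, int start, int needed, boolean doubling) {
        int size = start;
        int resizes = 0;
        while (size < needed) {
            size = doubling ? 2 * size : 12 * size / 10;   // mirrors the patched formula
            resizes++;
        }
        System.out.println(name + ": final capacity " + size + " for " + needed + " chars after " + resizes + " resizes");
    }

    public static void main(String[] args) {
        simulate("double  ", 360, 1_000_000, true);    // large overshoot, few copies
        simulate("grow 20%", 360, 1_000_000, false);   // capacity stays close to the content, more copies
    }
}
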
@@ -478,15 +478,12 @@ public final class CharBuffer extends Writer {
this.offset = 0;
}
public void reset(final int newSize) {
this.resize(newSize);
this.reset();
}
public void resize(final int newSize) {
if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize);
final char[] v = new char[newSize];
System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize);
/**
* call trimToSize() whenever a CharBuffer is not extended any more and is kept to store the content permanently
*/
public void trimToSize() {
final char[] v = new char[this.length];
System.arraycopy(this.buffer, this.offset, v, 0, this.length);
this.buffer = v;
}
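
trimToSize() copies the live region into an array of exactly the right length, so a buffer that is kept around permanently no longer carries the slack left over from the growth policy. A standalone sketch of the same idea with a hypothetical TrimToSizeDemo class:

import java.util.Arrays;

public class TrimToSizeDemo {
    private char[] buffer = new char[80];   // grows with slack while content is appended
    private int offset = 0;
    private int length = 0;

    void append(String s) {
        while (offset + length + s.length() > buffer.length) {
            buffer = Arrays.copyOf(buffer, 12 * buffer.length / 10 + s.length());
        }
        s.getChars(0, s.length(), buffer, offset + length);
        length += s.length();
    }

    void trimToSize() {
        final char[] v = new char[length];
        System.arraycopy(buffer, offset, v, 0, length);   // keep only the live content
        buffer = v;
        offset = 0;
    }

    public static void main(String[] args) {
        TrimToSizeDemo b = new TrimToSizeDemo();
        for (int i = 0; i < 100; i++) b.append("x");
        System.out.println("before trim: capacity " + b.buffer.length + ", content " + b.length);
        b.trimToSize();
        System.out.println("after trim:  capacity " + b.buffer.length + ", content " + b.length);
    }
}
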
@@ -497,13 +494,15 @@ public final class CharBuffer extends Writer {
}
@Override
public void close() throws IOException {
public void close() {
this.length = 0;
this.offset = 0;
this.buffer = null; // assist with garbage collection
}
@Override
public void flush() throws IOException {
// TODO Auto-generated method stub
public void flush() {
trimToSize();
}
}
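
close() now clears the internal array reference, so the backing char[] becomes collectible even while something still holds the CharBuffer object itself, and flush() reuses trimToSize() to shed slack. A standalone sketch of the reference-clearing effect, with a hypothetical ReleaseOnCloseDemo class (System.gc() is only a hint to the JVM, but usually sufficient for a demo):

public class ReleaseOnCloseDemo {
    static class Holder {
        private char[] big = new char[10_000_000];        // roughly 20 MB of char data
        void close() { big = null; }                      // drop the only reference to the array
    }

    static long usedMemory() {
        Runtime rt = Runtime.getRuntime();
        return rt.totalMemory() - rt.freeMemory();
    }

    public static void main(String[] args) {
        Holder h = new Holder();                          // h stays referenced until the end of main
        System.out.println("allocated:   ~" + usedMemory() / (1024 * 1024) + " MB in use");
        h.close();
        System.gc();
        System.out.println("after close: ~" + usedMemory() / (1024 * 1024) + " MB in use");
        System.out.println("holder object still alive, array cleared: " + (h.big == null));
    }
}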