Adding a limit of 1000 links that a parser shall store during indexing.

A limit was necessary because some web pages contain such a huge number of
links that this alone can easily cause an OOM.
Whether a limit of 1000 links is sufficient or too weak can only be
answered by testing this feature.
pull/1/head
Michael Peter Christen 13 years ago
parent 613b45f604
commit 276a66a793
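The sketch below illustrates the idea behind the change; it is not part of the commit. YaCy's HashARC is assumed to provide a similar size bound through its constructor argument, while this hypothetical stand-in uses a plain LinkedHashMap with an eviction rule: once more entries than maxLinks are stored, the oldest one is dropped, so link storage stays bounded no matter how many links a page contains.

import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical stand-in for a size-bounded link map (illustration only):
// the map never holds more than maxLinks entries, so a page with an
// extreme number of links cannot exhaust the heap.
public class BoundedLinkMap<K, V> extends LinkedHashMap<K, V> {
    private final int maxLinks;

    public BoundedLinkMap(final int maxLinks) {
        this.maxLinks = maxLinks;
    }

    @Override
    protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
        // evict the oldest entry once the limit is exceeded
        return size() > maxLinks;
    }
}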

@@ -455,7 +455,7 @@ public class Crawler_p {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile), 10000);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer);

@@ -141,7 +141,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL);
final ContentScraper scraper = new ContentScraper(baseURL, 10000);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);

@@ -492,7 +492,7 @@ public final class HTTPDFileHandler {
aBuffer.append(" <li><a href=\"" + path + element + "/\">" + element + "/</a><br/></li>\n");
} else {
if (element.endsWith("html") || (element.endsWith("htm"))) {
scraper = ContentScraper.parseResource(f);
scraper = ContentScraper.parseResource(f, 10000);
headline = scraper.getTitle();
author = scraper.getAuthor();
publisher = scraper.getPublisher();
@@ -1055,7 +1055,7 @@ public final class HTTPDFileHandler {
// save position
fis.mark(1000);
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false);
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false, 10);
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset != null) mimeType = mimeType + "; charset="+charset;

@@ -32,7 +32,6 @@ import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@@ -47,6 +46,7 @@ import javax.swing.event.EventListenerList;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.storage.HashARC;
import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
@@ -131,6 +131,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
private MultiProtocolURI canonical;
private final int maxLinks;
/**
@@ -149,21 +150,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Evaluation evaluationScores;
@SuppressWarnings("unchecked")
public ContentScraper(final MultiProtocolURI root) {
public ContentScraper(final MultiProtocolURI root, int maxLinks) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.maxLinks = maxLinks;
this.evaluationScores = new Evaluation();
this.rss = new HashMap<MultiProtocolURI, String>();
this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.embeds = new HashMap<MultiProtocolURI, EmbedEntry>();
this.rss = new HashARC<MultiProtocolURI, String>(maxLinks);
this.css = new HashARC<MultiProtocolURI, String>(maxLinks);
this.anchors = new HashARC<MultiProtocolURI, Properties>(maxLinks);
this.images = new HashARC<MultiProtocolURI, ImageEntry>(maxLinks);
this.embeds = new HashARC<MultiProtocolURI, EmbedEntry>(maxLinks);
this.frames = new HashSet<MultiProtocolURI>();
this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>();
this.metas = new HashARC<String, String>(maxLinks);
this.script = new HashSet<MultiProtocolURI>();
this.title = EMPTY_STRING;
this.headlines = new ArrayList[6];
@@ -514,7 +516,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root);
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
@@ -912,19 +914,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public static ContentScraper parseResource(final File file) throws IOException {
public static ContentScraper parseResource(final File file, final int maxLinks) throws IOException {
// load page
final byte[] page = FileUtils.read(file);
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false, maxLinks);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"), maxLinks);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();

@@ -60,13 +60,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
final String inputStreamCharset,
final MultiProtocolURI rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect
final boolean passbyIfBinarySuspect,
final int maxLinks
) {
// create an input stream for buffering
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks);
scraper.registerHtmlFilterEventListener(this);
try {

@@ -544,7 +544,7 @@ public final class TransformerWriter extends Writer {
System.exit(0);
final char[] buffer = new char[512];
try {
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost:8090"));
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost:8090"), 1000);
final Transformer transformer = new ContentTransformer();
final Reader is = new FileReader(args[0]);
final FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));

@@ -52,6 +52,7 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
private static final int maxLinks = 1000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -93,7 +94,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
final Document document = transformScraper(location, mimeType, documentCharset, scraper);
return new Document[]{document};
@@ -151,7 +152,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final MultiProtocolURI location,
final String documentCharset,
InputStream sourceStream) throws Parser.Failure, IOException {
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
// make a scraper
String charset = null;
@@ -164,7 +166,7 @@
// nothing found: try to find a meta-tag
if (charset == null) {
try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
htmlFilter.close();
@@ -198,7 +200,7 @@
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location);
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(4096, sourceStream.available()));
try {
FileUtils.copy(sourceStream, writer, c);
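A possible call site for the extended parseToScraper signature, sketched under assumptions: the file name, charset, and exception handling are illustrative and not part of this commit.

// Illustrative usage only: feed an HTML stream through the parser with an
// explicit link limit; identifiers match the signatures shown in the hunks above.
try (final InputStream sourceStream = new FileInputStream("page.html")) {
    final ContentScraper scraper = htmlParser.parseToScraper(
            new MultiProtocolURI("http://localhost/page.html"), "UTF-8", sourceStream, 1000);
    System.out.println(scraper.getTitle()); // the scraper's link maps hold at most 1000 entries
} catch (final Exception e) {
    e.printStackTrace();
}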
