Adding a limit of 1000 links that a parser shall store during indexing.

A limit was necessary because some web pages contain such a huge number of
links that this alone can easily cause an OOM.
Whether a limit of 1000 links is sufficient or too weak can only be
answered by testing this feature.
pull/1/head
Michael Peter Christen 13 years ago
parent 613b45f604
commit 276a66a793
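The sketch below illustrates the idea behind the change; it is not part of the commit. YaCy's HashARC is assumed to provide a similar size bound through its constructor argument, while this hypothetical stand-in uses a plain LinkedHashMap with an eviction rule: once more entries than maxLinks are stored, the oldest one is dropped, so link storage stays bounded no matter how many links a page contains.

import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical stand-in for a size-bounded link map (illustration only):
// the map never holds more than maxLinks entries, so a page with an
// extreme number of links cannot exhaust the heap.
public class BoundedLinkMap<K, V> extends LinkedHashMap<K, V> {
    private final int maxLinks;

    public BoundedLinkMap(final int maxLinks) {
        this.maxLinks = maxLinks;
    }

    @Override
    protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
        // evict the oldest entry once the limit is exceeded
        return size() > maxLinks;
    }
}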

@@ -455,7 +455,7 @@ public class Crawler_p {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile), 10000);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer);

@@ -141,7 +141,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL);
final ContentScraper scraper = new ContentScraper(baseURL, 10000);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);

@@ -492,7 +492,7 @@ public final class HTTPDFileHandler {
aBuffer.append(" <li><a href=\"" + path + element + "/\">" + element + "/</a><br/></li>\n");
} else {
if (element.endsWith("html") || (element.endsWith("htm"))) {
scraper = ContentScraper.parseResource(f);
scraper = ContentScraper.parseResource(f, 10000);
headline = scraper.getTitle();
author = scraper.getAuthor();
publisher = scraper.getPublisher();
@@ -1055,7 +1055,7 @@ public final class HTTPDFileHandler {
// save position
fis.mark(1000);
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false);
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false, 10);
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset != null) mimeType = mimeType + "; charset="+charset;

@@ -32,7 +32,6 @@ import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@@ -47,6 +46,7 @@ import javax.swing.event.EventListenerList;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.storage.HashARC;
import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
@@ -131,6 +131,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
private MultiProtocolURI canonical;
private final int maxLinks;
/**
@@ -149,21 +150,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Evaluation evaluationScores;
@SuppressWarnings("unchecked")
public ContentScraper(final MultiProtocolURI root) {
public ContentScraper(final MultiProtocolURI root, int maxLinks) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.maxLinks = maxLinks;
this.evaluationScores = new Evaluation();
this.rss = new HashMap<MultiProtocolURI, String>();
this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.embeds = new HashMap<MultiProtocolURI, EmbedEntry>();
this.rss = new HashARC<MultiProtocolURI, String>(maxLinks);
this.css = new HashARC<MultiProtocolURI, String>(maxLinks);
this.anchors = new HashARC<MultiProtocolURI, Properties>(maxLinks);
this.images = new HashARC<MultiProtocolURI, ImageEntry>(maxLinks);
this.embeds = new HashARC<MultiProtocolURI, EmbedEntry>(maxLinks);
this.frames = new HashSet<MultiProtocolURI>();
this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>();
this.metas = new HashARC<String, String>(maxLinks);
this.script = new HashSet<MultiProtocolURI>();
this.title = EMPTY_STRING;
this.headlines = new ArrayList[6];
@@ -514,7 +516,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root);
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
@@ -912,19 +914,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public static ContentScraper parseResource(final File file) throws IOException {
public static ContentScraper parseResource(final File file, final int maxLinks) throws IOException {
// load page
final byte[] page = FileUtils.read(file);
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false, maxLinks);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"), maxLinks);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();

@@ -60,13 +60,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
final String inputStreamCharset,
final MultiProtocolURI rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect
final boolean passbyIfBinarySuspect,
final int maxLinks
) {
// create an input stream for buffering
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks);
scraper.registerHtmlFilterEventListener(this);
try {

@@ -544,7 +544,7 @@ public final class TransformerWriter extends Writer {
System.exit(0);
final char[] buffer = new char[512];
try {
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost:8090"));
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost:8090"), 1000);
final Transformer transformer = new ContentTransformer();
final Reader is = new FileReader(args[0]);
final FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));

@@ -52,6 +52,7 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
private static final int maxLinks = 1000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -93,7 +94,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
final Document document = transformScraper(location, mimeType, documentCharset, scraper);
return new Document[]{document};
@@ -151,7 +152,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final MultiProtocolURI location,
final String documentCharset,
InputStream sourceStream) throws Parser.Failure, IOException {
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
// make a scraper
String charset = null;
@@ -164,7 +166,7 @@
// nothing found: try to find a meta-tag
if (charset == null) {
try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
htmlFilter.close();
@@ -198,7 +200,7 @@
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location);
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(4096, sourceStream.available()));
try {
FileUtils.copy(sourceStream, writer, c);
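A possible call site for the extended parseToScraper signature, sketched under assumptions: the file name, charset, and exception handling are illustrative and not part of this commit.

// Illustrative usage only: feed an HTML stream through the parser with an
// explicit link limit; identifiers match the signatures shown in the hunks above.
try (final InputStream sourceStream = new FileInputStream("page.html")) {
    final ContentScraper scraper = htmlParser.parseToScraper(
            new MultiProtocolURI("http://localhost/page.html"), "UTF-8", sourceStream, 1000);
    System.out.println(scraper.getTitle()); // the scraper's link maps hold at most 1000 entries
} catch (final Exception e) {
    e.printStackTrace();
}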
