// ContentScraper.java // ----------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.document.parser.html; import java.io.ByteArrayInputStream; import java.io.CharArrayReader; import java.io.File; import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.swing.event.EventListenerList; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.storage.SizeLimitedMap; import net.yacy.cora.storage.SizeLimitedSet; import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; 
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;

/**
 * Scraper built on {@link AbstractScraper}.
 *
 * The static part of this class defines which HTML tags are scraped (see {@link Tag})
 * and sorts their lowercase names into two static sets according to their {@link TagType}.
 */
public class ContentScraper extends AbstractScraper implements Scraper {

    /** Document size limit, 40 * 1024 * 1024 (presumably bytes -- confirm against the users of this constant). */
    public static final int MAX_DOCSIZE = 40 * 1024 * 1024;

    /** The degree sign (U+00B0). */
    private final char degree = '\u00B0';

    // NOTE(review): the name suggests an HTML-escaped form of the minute sign, but the literal is a
    // plain apostrophe -- confirm against the code that reads this field.
    private final char[] minuteCharsHTML = "'".toCharArray();

    // statics: for initialization of the HTMLFilterAbstractScraper
    /** Names of all {@link Tag} constants of type {@link TagType#singleton}; filled by the static initializer. */
    private static final Set<String> linkTags0 = new HashSet<String>(12, 0.99f);
    /** Names of all {@link Tag} constants of type {@link TagType#pair}; filled by the static initializer. */
    private static final Set<String> linkTags1 = new HashSet<String>(15, 0.99f);

    /** Precompiled pattern matching a single line feed character. */
    private static final Pattern LB = Pattern.compile("\n");

    /**
     * Classification of a scraped tag. The static initializer registers singleton tags in
     * {@code linkTags0} and pair tags in {@code linkTags1}.
     */
    public enum TagType {
        singleton, pair;
    }

    /**
     * The tags handled by this scraper. The constant names are the tag names and are
     * deliberately lowercase, because they are registered via {@link #name()}.
     */
    public enum Tag {
        html(TagType.singleton),   // scraped as singleton to get attached properties like 'lang'
        body(TagType.singleton),   // scraped as singleton to get attached properties like 'class'
        div(TagType.singleton),    // scraped as singleton to get attached properties like 'id'
        img(TagType.singleton),
        base(TagType.singleton),
        frame(TagType.singleton),
        meta(TagType.singleton),
        area(TagType.singleton),
        link(TagType.singleton),
        embed(TagType.singleton),  //added by [MN]
        param(TagType.singleton),  //added by [MN]
        iframe(TagType.singleton), // scraped as singleton to get such iframes that have no closing tag
        a(TagType.pair),
        h1(TagType.pair),
        h2(TagType.pair),
        h3(TagType.pair),
        h4(TagType.pair),
        h5(TagType.pair),
        h6(TagType.pair),
        title(TagType.pair),
        b(TagType.pair),
        strong(TagType.pair),
        u(TagType.pair),
        i(TagType.pair),
        li(TagType.pair),
        script(TagType.pair),
        style(TagType.pair);

        /** The type of this tag; assigned once in the constructor and never changed. */
        public final TagType type;

        private Tag(final TagType type) {
            this.type = type;
        }
    }

    // all these tags must be given in lowercase, because the tags from the files are compared in lowercase
    static {
        // sort every tag name into the set that corresponds to its type
        for (final Tag tag: Tag.values()) {
            if (tag.type == TagType.singleton) linkTags0.add(tag.name());
            if (tag.type == TagType.pair) linkTags1.add(tag.name());
        }
        //