From 4304e07e6fc7be21293dac9af601c1e5a241119f Mon Sep 17 00:00:00 2001 From: Michael Christen Date: Sun, 15 Jan 2023 01:20:12 +0100 Subject: [PATCH] crawl profile adoption to new tag valency attribute --- source/net/yacy/crawler/CrawlSwitchboard.java | 55 ++-- .../net/yacy/crawler/RecrawlBusyThread.java | 4 +- .../net/yacy/crawler/data/CrawlProfile.java | 169 ++++++----- .../net/yacy/crawler/retrieval/Response.java | 11 +- source/net/yacy/document/AbstractParser.java | 71 ++--- source/net/yacy/document/Parser.java | 130 ++++---- source/net/yacy/document/TextParser.java | 78 +++-- .../document/importer/MediawikiImporter.java | 205 ++++++------- .../parser/AbstractCompressorParser.java | 282 ++++++++++-------- .../net/yacy/document/parser/bzipParser.java | 139 ++++----- .../net/yacy/document/parser/gzipParser.java | 214 ++++++------- .../net/yacy/document/parser/htmlParser.java | 154 +++++++--- .../yacy/document/parser/sevenzipParser.java | 27 +- .../net/yacy/document/parser/tarParser.java | 280 ++++++++--------- .../net/yacy/document/parser/zipParser.java | 6 +- source/net/yacy/htroot/Crawler_p.java | 1 + source/net/yacy/htroot/QuickCrawlLink_p.java | 3 +- .../net/yacy/repository/LoaderDispatcher.java | 11 +- source/net/yacy/search/Switchboard.java | 6 +- .../net/yacy/search/index/DocumentIndex.java | 21 +- 20 files changed, 1024 insertions(+), 843 deletions(-) diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index bbcbc4018..aca7b95cd 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -51,6 +51,7 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; @@ -60,7 +61,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; public final class CrawlSwitchboard { - + public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep"; public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow"; public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob"; @@ -75,7 +76,7 @@ public final class CrawlSwitchboard { public static Set DEFAULT_PROFILES = new HashSet(); static { - DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP); + DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP); DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW); DEFAULT_PROFILES.add(CRAWL_PROFILE_RECRAWL_JOB); DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY); @@ -93,11 +94,11 @@ public final class CrawlSwitchboard { // Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl */ - /** - * The default recrawl time cycle in minutes for recrawl jobs. The recrawl date - * limit can be set up by the recrawl job selection query, but a default limit - * prevent unwanted overload on targets) - */ + /** + * The default recrawl time cycle in minutes for recrawl jobs. 
The recrawl date + * limit can be set up by the recrawl job selection query, but a default limit + * prevent unwanted overload on targets) + */ public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // on hour public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days @@ -139,7 +140,7 @@ public final class CrawlSwitchboard { try { p = new CrawlProfile(this.profilesActiveCrawls.get(handle)); } catch (final IOException | SpaceExceededException | RuntimeException e ) { - ConcurrentLog.warn("CrawlProfiles", "Could not load profile " + handle, e); + ConcurrentLog.warn("CrawlProfiles", "Could not load profile " + handle, e); p = null; } if ( p == null ) { @@ -275,16 +276,15 @@ public final class CrawlSwitchboard { public RowHandleSet getURLHashes(final byte[] profileKey) { return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey)); } - - + private void initActiveCrawlProfiles() { - final Switchboard sb = Switchboard.getSwitchboard(); - - // generate new default entry for deep auto crawl - this.defaultAutocrawlDeepProfile = - new CrawlProfile( - CRAWL_PROFILE_AUTOCRAWL_DEEP, - CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch + final Switchboard sb = Switchboard.getSwitchboard(); + + // generate new default entry for deep auto crawl + this.defaultAutocrawlDeepProfile = + new CrawlProfile( + CRAWL_PROFILE_AUTOCRAWL_DEEP, + CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch @@ -308,12 +308,13 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP, ClientIdentification.yacyInternetCrawlerAgentName, + TagValency.EVAL, null, null, 0); - this.profilesActiveCrawls.put( - UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()), - this.defaultAutocrawlDeepProfile); - // generate new default entry for shallow auto crawl + this.profilesActiveCrawls.put( + UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()), + this.defaultAutocrawlDeepProfile); + // generate new default entry for shallow auto crawl this.defaultAutocrawlShallowProfile = new CrawlProfile( CRAWL_PROFILE_AUTOCRAWL_SHALLOW, @@ -341,6 +342,7 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW, ClientIdentification.yacyInternetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -364,7 +366,7 @@ public final class CrawlSwitchboard { true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, - false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, + false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true), sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true), true, @@ -373,6 +375,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_PROXY, ClientIdentification.yacyProxyAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -405,6 +408,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_REMOTE, ClientIdentification.yacyInternetCrawlerAgentName, + TagValency.EVAL, null, null, 0); 
this.profilesActiveCrawls.put( @@ -437,6 +441,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -469,6 +474,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -509,6 +515,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, ClientIdentification.browserAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -541,6 +548,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -573,6 +581,7 @@ public final class CrawlSwitchboard { CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -605,6 +614,7 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_SURROGATE, ClientIdentification.yacyIntranetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put( @@ -640,6 +650,7 @@ public final class CrawlSwitchboard { CacheStrategy.NOCACHE, collection, ClientIdentification.yacyIntranetCrawlerAgentName, + TagValency.EVAL, null, null, 0); this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile); diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java index 7c3571705..af32db0cf 100644 --- a/source/net/yacy/crawler/RecrawlBusyThread.java +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.retrieval.Request; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.workflow.AbstractBusyThread; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionSchema; @@ -355,7 +356,8 @@ public class RecrawlBusyThread extends AbstractBusyThread { true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB, - ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0); + ClientIdentification.yacyInternetCrawlerAgentName, + TagValency.EVAL, null, null, 0); return profile; } diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 72d7bff9a..7acb8bd2c 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -55,6 +55,7 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.data.word.Word; import net.yacy.search.query.QueryParams; import net.yacy.search.schema.CollectionSchema; @@ -69,19 +70,19 @@ public class CrawlProfile extends 
ConcurrentHashMap implements M /** Regular expression pattern matching everything */ public static final String MATCH_ALL_STRING = ".*"; - + /** Regular expression pattern matching nothing */ public static final String MATCH_NEVER_STRING = ""; - + /** Empty Solr query */ public static final String SOLR_EMPTY_QUERY = ""; - + /** Match all Solr query */ public static final String SOLR_MATCH_ALL_QUERY = AbstractSolrConnector.CATCHALL_QUERY; - + /** Regular expression matching everything */ public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING); - + /** Regular expression matching nothing */ public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING); @@ -126,14 +127,15 @@ public class CrawlProfile extends ConcurrentHashMap implements M INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"), INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"), COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"), - IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"), + DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"), + VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"), SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"), TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent"); - + public static final int BOOLEAN = 0; public static final int INTEGER = 1; public static final int STRING = 2; - + public final String key, label; public final boolean readonly; public final int type; @@ -143,39 +145,39 @@ public class CrawlProfile extends ConcurrentHashMap implements M this.type = type; this.label = label; } - + @Override public String toString() { return this.key; } } - - + private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null; - + /** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */ private Pattern crawlerOriginUrlMustMatch = null; - + /** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */ private Pattern crawlerOriginUrlMustNotMatch = null; - + private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null; private Pattern crawlernodepthlimitmatch = null; private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null; private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null; - + /** Pattern on the media type documents must match before being indexed * @see CollectionSchema#content_type */ private Pattern indexMediaTypeMustMatch = null; - + /** Pattern on the media type documents must not match before being indexed * @see CollectionSchema#content_type */ private Pattern indexMediaTypeMustNotMatch = null; - + private Pattern snapshotsMustnotmatch = null; private final Map doms; - private final Set ignore_class_name; + private final TagValency defaultValency; + private final Set valencySwitchTagNames; private final VocabularyScraper scraper; /** @@ -238,7 +240,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M final CacheStrategy cacheStrategy, final String collections, final String userAgentName, - final Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset) { 
super(40); @@ -252,40 +255,42 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(CrawlAttribute.NAME.key, name); put(CrawlAttribute.AGENT_NAME.key, userAgentName); put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true); - put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch); - put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch); - put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch); - put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch); - put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch); - put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch); - put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch); + put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch); + put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch); + put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch); + put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch); + put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch); + put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch); + put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch); put(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch); - put(CrawlAttribute.INDEXING_URL_MUSTMATCH.key, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch); - put(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch); - put(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch); + put(CrawlAttribute.INDEXING_URL_MUSTMATCH.key, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch); + put(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch); + put(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch); put(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch); - put(CrawlAttribute.DEPTH.key, depth); - put(CrawlAttribute.DIRECT_DOC_BY_URL.key, directDocByURL); - put(CrawlAttribute.RECRAWL_IF_OLDER.key, recrawlIfOlder == null ? 
Long.MAX_VALUE : recrawlIfOlder.getTime()); - put(CrawlAttribute.DOM_MAX_PAGES.key, domMaxPages); - put(CrawlAttribute.CRAWLING_Q.key, crawlingQ); // crawling of urls with '?' - put(CrawlAttribute.FOLLOW_FRAMES.key, followFrames); // load pages contained in frames or ifames - put(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored + put(CrawlAttribute.DEPTH.key, depth); + put(CrawlAttribute.DIRECT_DOC_BY_URL.key, directDocByURL); + put(CrawlAttribute.RECRAWL_IF_OLDER.key, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime()); + put(CrawlAttribute.DOM_MAX_PAGES.key, domMaxPages); + put(CrawlAttribute.CRAWLING_Q.key, crawlingQ); // crawling of urls with '?' + put(CrawlAttribute.FOLLOW_FRAMES.key, followFrames); // load pages contained in frames or ifames + put(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored put(CrawlAttribute.OBEY_HTML_ROBOTS_NOFOLLOW.key, obeyHtmlRobotsNofollow); - put(CrawlAttribute.INDEX_TEXT.key, indexText); - put(CrawlAttribute.INDEX_MEDIA.key, indexMedia); - put(CrawlAttribute.STORE_HTCACHE.key, storeHTCache); - put(CrawlAttribute.REMOTE_INDEXING.key, remoteIndexing); - put(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key, snapshotsMaxDepth); - put(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key, snapshotsLoadImage); - put(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key, snapshotsReplaceOld); - put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch); - put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString()); - put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll("")); + put(CrawlAttribute.INDEX_TEXT.key, indexText); + put(CrawlAttribute.INDEX_MEDIA.key, indexMedia); + put(CrawlAttribute.STORE_HTCACHE.key, storeHTCache); + put(CrawlAttribute.REMOTE_INDEXING.key, remoteIndexing); + put(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key, snapshotsMaxDepth); + put(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key, snapshotsLoadImage); + put(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key, snapshotsReplaceOld); + put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch); + put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString()); + put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll("")); // we transform the ignore_class_name and scraper information into a JSON Array - this.ignore_class_name = ignore_class_name == null ? new HashSet() : ignore_class_name; - String jsonString = new JSONArray(ignore_class_name).toString(); - put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString); + this.defaultValency = defaultValency; + this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet() : valencySwitchTagNames; + String jsonString = new JSONArray(valencySwitchTagNames).toString(); + put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name()); + put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString); this.scraper = scraper == null ? new VocabularyScraper() : scraper; jsonString = this.scraper.toString(); assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString; @@ -305,9 +310,11 @@ public class CrawlProfile extends ConcurrentHashMap implements M super(ext == null ? 
1 : ext.size()); if (ext != null) putAll(ext); this.doms = new ConcurrentHashMap(); - String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key); + String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key); + this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency); + String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key); JSONArray a; - if(jsonString == null) { + if (jsonString == null) { a = new JSONArray(); } else { try { @@ -317,9 +324,9 @@ public class CrawlProfile extends ConcurrentHashMap implements M a = new JSONArray(); } } - this.ignore_class_name = new HashSet(); + this.valencySwitchTagNames = new HashSet(); for (int i = 0; i < a.length(); i++) try { - this.ignore_class_name.add(a.getString(i)); + this.valencySwitchTagNames.add(a.getString(i)); } catch (JSONException e) {} jsonString = ext.get(CrawlAttribute.SCRAPER.key); if (jsonString == null || jsonString.length() == 0) { @@ -336,14 +343,18 @@ public class CrawlProfile extends ConcurrentHashMap implements M } } - public Set ignoreDivClassName() { - return this.ignore_class_name; + public TagValency defaultValency() { + return this.defaultValency; + } + + public Set valencySwitchTagNames() { + return this.valencySwitchTagNames; } public VocabularyScraper scraper() { return this.scraper; } - + public void domInc(final String domain) { if (domain == null) return; // may be correct for file system crawls final AtomicInteger dp = this.doms.get(domain); @@ -427,7 +438,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M //if (r == null) return null; return r; } - + private Map cmap = null; /** @@ -440,7 +451,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M this.cmap = collectionParser(r); return this.cmap; } - + public static Map collectionParser(String collectionString) { if (collectionString == null || collectionString.length() == 0) return new HashMap(); String[] cs = CommonPattern.COMMA.split(collectionString); @@ -470,7 +481,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String r = get(CrawlAttribute.COLLECTIONS.key); return r == null || r.length() == 0 || "user".equals(r) ? name() : r; } - + /** * Gets the regex which must be matched by URLs in order to be crawled. * @return regex which must be matched @@ -484,7 +495,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.crawlerurlmustmatch; } - + /** * Render the urlMustMatchPattern as a String of limited size, suffixing it with * "..." when it is truncated. 
Used to prevent unnecessary growth of the logs, @@ -516,7 +527,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.crawlerurlmustnotmatch; } - + /** * Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack * @@ -538,7 +549,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.crawlerOriginUrlMustMatch; } - + /** * Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack * @@ -601,7 +612,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (list.length == 1 && list.length == 0) list = new String[0]; return list; } - + /** * If the regex matches with the url, then there is no depth limit on the crawl (it overrides depth == 0) * @return regex which must be matched @@ -643,7 +654,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.indexurlmustnotmatch; } - + /** * Gets the regex which must be matched by URLs in order to be indexed. * @return regex which must be matched @@ -671,7 +682,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.indexcontentmustnotmatch; } - + /** * Get the Pattern on media type that documents must match in order to be indexed * @@ -693,7 +704,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.indexMediaTypeMustMatch; } - + /** * Get the Pattern on media type that documents must not match in order to be indexed * @@ -715,9 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } return this.indexMediaTypeMustNotMatch; } - - - + /** * Gets depth of crawl job (or height of the tree which will be * created by the crawler). @@ -743,7 +752,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + /** * @return true when the crawler must always cross check the eventual URL file * extension against the actual Media Type, even when file extension is @@ -772,7 +781,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M public void setCacheStrategy(final CacheStrategy newStrategy) { put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString()); } - + /** * Gets the minimum date that an entry must have to be re-crawled. 
* @return time in ms representing a date @@ -847,13 +856,13 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public boolean remoteIndexing() { final String r = get(CrawlAttribute.REMOTE_INDEXING.key); if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public int snapshotMaxdepth() { final String r = get(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key); if (r == null) return -1; @@ -866,7 +875,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M return -1; } } - + public boolean snapshotLoadImage() { final String r = get(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key); if (r == null) return false; @@ -878,7 +887,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public Pattern snapshotsMustnotmatch() { if (this.snapshotsMustnotmatch == null) { final String r = get(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key); @@ -887,7 +896,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.snapshotsMustnotmatch; - } + } public int timezoneOffset() { final String timezoneOffset = get(CrawlAttribute.TIMEZONEOFFSET.key); @@ -898,7 +907,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M return 0; } } - + /** * get a recrawl date for a given age in minutes * @param oldTimeMinutes @@ -946,7 +955,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+"; return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(url.getPath()).append(".*").toString(); } - + public boolean isPushCrawlProfile() { return this.name().startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB); } @@ -1008,7 +1017,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle()); - + int i = 0; if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) { String item; @@ -1021,7 +1030,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); } - + public static void main(String[] args) { // test to convert the key set from set to string and back Set a = new HashSet<>(); diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 42a1558c7..acf7c6fd0 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -48,6 +48,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.search.Switchboard; public class Response { @@ -853,7 +854,7 @@ public class Response { // 4) proxy-load (initiator is "------------") // 5) local prefetch/crawling (initiator is own seedHash) // 6) local fetching for global crawling (other known or unknown initiator) - // 7) local surrogates processing (can not be known here 
: crawl profile is required) + // 7) local surrogates processing (can not be known here : crawl profile is required) EventOrigin processCase = EventOrigin.UNKNOWN; // FIXME the equals seems to be incorrect: String.equals(boolean) if (initiator() == null || initiator().length == 0 || ASCII.String(initiator()).equals("------------")) { @@ -873,9 +874,13 @@ public class Response { final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content); + return TextParser.parseSource( + url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), + this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), + TagValency.EVAL, new HashSet(), + new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content); } catch(Parser.Failure e) { - throw e; + throw e; } catch (final Exception e) { return null; } diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index 893687497..f8132be1e 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -32,6 +32,7 @@ import java.util.Set; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.document.parser.html.TagValency; public abstract class AbstractParser implements Parser { @@ -41,20 +42,20 @@ public abstract class AbstractParser implements Parser { protected final Set SUPPORTED_MIME_TYPES = new LinkedHashSet(); protected final Set SUPPORTED_EXTENSIONS = new HashSet(); private final String name; - + /** * initialize a parser with a name * @param name */ public AbstractParser(final String name) { - this.name = name; - } + this.name = name; + } /* * The following abstract implementations create a circular call which would cause an endless loop when called. * They are both here because one of them must be overridden by the implementing class. */ - + @Override public Document[] parse( DigestURL url, @@ -64,7 +65,7 @@ public abstract class AbstractParser implements Parser { int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException { - return parse(url, mimeType, charset, new HashSet(), scraper, timezoneOffset, source); + return parse(url, mimeType, charset, TagValency.EVAL, new HashSet(), scraper, timezoneOffset, source); } @Override @@ -72,15 +73,15 @@ public abstract class AbstractParser implements Parser { DigestURL url, String mimeType, String charset, - Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, VocabularyScraper scraper, int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException { - return parse(url, mimeType, charset, scraper, timezoneOffset, source); + return parse(url, mimeType, charset, scraper, timezoneOffset, source); } - - + /* * The following abstract implementations create a circular call which would cause an endless loop when called. * They are both here because one of them must be overridden by the implementing class. 
@@ -88,32 +89,33 @@ public abstract class AbstractParser implements Parser { @Override public Document[] parseWithLimits( - final DigestURL location, - final String mimeType, - final String charset, - final VocabularyScraper scraper, - final int timezoneOffset, - final InputStream source, - final int maxLinks, - final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException { - return parseWithLimits(location, mimeType, charset, new HashSet(), scraper, timezoneOffset, source, maxLinks, maxBytes); + final DigestURL location, + final String mimeType, + final String charset, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source, + final int maxLinks, + final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException { + return parseWithLimits(location, mimeType, charset, TagValency.EVAL, new HashSet(), scraper, timezoneOffset, source, maxLinks, maxBytes); } - + @Override public Document[] parseWithLimits( - DigestURL location, - String mimeType, - String charset, - final Set ignore_class_name, - VocabularyScraper scraper, - int timezoneOffset, - InputStream source, - int maxLinks, - long maxBytes) - throws Failure, InterruptedException, UnsupportedOperationException { - return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes); + DigestURL location, + String mimeType, + String charset, + final TagValency defaultValency, + final Set valencySwitchTagNames, + VocabularyScraper scraper, + int timezoneOffset, + InputStream source, + int maxLinks, + long maxBytes) + throws Failure, InterruptedException, UnsupportedOperationException { + return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes); } - + /** * return the name of the parser */ @@ -164,12 +166,11 @@ public abstract class AbstractParser implements Parser { if (t != null) c.add(t); return c; } - + @Override public boolean isParseWithLimitsSupported() { - /* Please override on subclasses when parseWithLimits is supported */ - return false; + /* Please override on subclasses when parseWithLimits is supported */ + return false; } - } diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index fda309be9..f2940a02c 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -28,6 +28,7 @@ import java.util.Set; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.document.parser.html.TagValency; public interface Parser { @@ -63,72 +64,87 @@ public interface Parser { int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException; - + public Document[] parse( DigestURL url, String mimeType, String charset, - Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, VocabularyScraper scraper, int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException; - + + /** + * Parse an input stream, eventually terminating processing when a total of + * maxLinks URLS (anchors, images links, media links...) have been reached, + * or when maxBytes content bytes have been processed, thus potentially + * resulting in partially parsed documents (with + * {@link Document#isPartiallyParsed()} returning true). 
Some parser + * implementations will not support parsing within maxLinks or maxBytes + * limits : make sure to check this by calling fist + * {@link #isParseWithLimitsSupported()}, or a UnsupportedOperationException + * could be thrown. + * + * @param url + * the URL of the source + * @param mimeType + * the mime type of the source, if known + * @param charset + * the charset name of the source, if known + * @param scraper + * an entity scraper to detect facets from text annotation + * context + * @param timezoneOffset + * the local time zone offset + * @param source + * a input stream + * @param maxLinks + * the maximum total number of links to parse and add to the + * result documents + * @param maxBytes + * the maximum number of content bytes to process + * @return a list of documents that result from parsing the source, with + * empty or null text. + * @throws Parser.Failure + * when the parser processing failed + * @throws InterruptedException + * when the processing was interrupted before termination + * @throws UnsupportedOperationException + * when the parser implementation doesn't support parsing within + * limits + */ + public Document[] parseWithLimits( + DigestURL url, + String mimeType, + String charset, + VocabularyScraper scraper, + int timezoneOffset, + InputStream source, + int maxLinks, + long maxBytes) + throws Parser.Failure, InterruptedException, UnsupportedOperationException; + + + public Document[] parseWithLimits( + final DigestURL location, + final String mimeType, + final String documentCharset, + final TagValency defaultValency, + final Set valencySwitchTagNames, + final VocabularyScraper vocscraper, + final int timezoneOffset, + final InputStream sourceStream, + final int maxLinks, + final long maxBytes) + throws Parser.Failure, InterruptedException, UnsupportedOperationException; + /** - * Parse an input stream, eventually terminating processing when a total of - * maxLinks URLS (anchors, images links, media links...) have been reached, - * or when maxBytes content bytes have been processed, thus potentially - * resulting in partially parsed documents (with - * {@link Document#isPartiallyParsed()} returning true). Some parser - * implementations will not support parsing within maxLinks or maxBytes - * limits : make sure to check this by calling fist - * {@link #isParseWithLimitsSupported()}, or a UnsupportedOperationException - * could be thrown. - * - * @param url - * the URL of the source - * @param mimeType - * the mime type of the source, if known - * @param charset - * the charset name of the source, if known - * @param scraper - * an entity scraper to detect facets from text annotation - * context - * @param timezoneOffset - * the local time zone offset - * @param source - * a input stream - * @param maxLinks - * the maximum total number of links to parse and add to the - * result documents - * @param maxBytes - * the maximum number of content bytes to process - * @return a list of documents that result from parsing the source, with - * empty or null text. 
- * @throws Parser.Failure - * when the parser processing failed - * @throws InterruptedException - * when the processing was interrupted before termination - * @throws UnsupportedOperationException - * when the parser implementation doesn't support parsing within - * limits - */ - public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, - VocabularyScraper scraper, - int timezoneOffset, InputStream source, int maxLinks, long maxBytes) - throws Parser.Failure, InterruptedException, UnsupportedOperationException; - - - public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, - final Set ignore_class_name, final VocabularyScraper vocscraper, - final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes) - throws Parser.Failure, InterruptedException, UnsupportedOperationException; - - /** - * @return true when the parser implementation supports the - * parseWithLimits() operation. - */ - public boolean isParseWithLimitsSupported(); + * @return true when the parser implementation supports the + * parseWithLimits() operation. + */ + public boolean isParseWithLimitsSupported(); // methods to that shall make it possible to put Parser objects into a hashtable diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 83327adcc..6fefa60c8 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -51,6 +51,7 @@ import net.yacy.document.parser.docParser; import net.yacy.document.parser.genericParser; import net.yacy.document.parser.gzipParser; import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException; +import net.yacy.document.parser.html.TagValency; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.linkScraperParser; import net.yacy.document.parser.mmParser; @@ -184,7 +185,8 @@ public final class TextParser { final DigestURL location, final String mimeType, final String charset, - final Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final int depth, @@ -201,7 +203,7 @@ public final class TextParser { throw new Parser.Failure(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream); + docs = parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -218,7 +220,8 @@ public final class TextParser { final DigestURL location, String mimeType, final String charset, - final Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final int depth, @@ -236,7 +239,7 @@ public final class TextParser { } assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); - final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE); + final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, 
valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE); return docs; } @@ -248,7 +251,8 @@ public final class TextParser { final DigestURL location, String mimeType, final String charset, - final Set ignoreClassNames, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final int depth, @@ -261,14 +265,15 @@ public final class TextParser { final Set idioms = new HashSet<>(); idioms.add(TextParser.genericIdiom); - return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE); + return parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE); } private static Document[] parseSource( final DigestURL location, String mimeType, final String charset, - final Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final int depth, @@ -330,7 +335,7 @@ public final class TextParser { CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream); try { - return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset, + return parseSource(location, mimeType, parser, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, nonCloseInputStream, maxLinks, maxBytes); } catch (final Parser.Failure e) { /* Try to reset the marked stream. If the failed parser has consumed too many bytes : @@ -378,11 +383,11 @@ public final class TextParser { int maxBytesToRead = -1; if(maxBytes < Integer.MAX_VALUE) { /* Load at most maxBytes + 1 : - - to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure - - but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */ + - to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure + - but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */ maxBytesToRead = (int)maxBytes + 1; } - if(contentLength >= 0 && contentLength < maxBytesToRead) { + if (contentLength >= 0 && contentLength < maxBytesToRead) { maxBytesToRead = (int)contentLength; } @@ -392,16 +397,23 @@ public final class TextParser { } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes); + final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, b, maxLinks, maxBytes); return docs; } - public static Document[] parseSource(final DigestURL location, String mimeType, final String charset, - final Set ignore_class_name, - final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength, + public static Document[] parseSource( + final DigestURL location, + String mimeType, + final String charset, + final TagValency defaultValency, + final Set valencySwitchTagNames, + final VocabularyScraper scraper, + final int timezoneOffset, + final int depth, + final long contentLength, final InputStream sourceStream) throws Parser.Failure { - return 
parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream, + return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, contentLength, sourceStream, Integer.MAX_VALUE, Long.MAX_VALUE); } @@ -424,10 +436,19 @@ public final class TextParser { * @return a list of documents that result from parsing the source, with empty or null text. * @throws Parser.Failure when the parser processing failed */ - public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set ignoreClassNames, - final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks, + public static Document[] parseWithLimits( + final DigestURL location, + String mimeType, + final String charset, + final TagValency defaultValency, + final Set valencySwitchTagNames, + final int timezoneOffset, + final int depth, + final long contentLength, + final InputStream sourceStream, + int maxLinks, long maxBytes) throws Parser.Failure{ - return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength, + return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength, sourceStream, maxLinks, maxBytes); } @@ -449,10 +470,11 @@ public final class TextParser { * @return a list of documents that result from parsing the source, with empty or null text. * @throws Parser.Failure when the parser processing failed */ - public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, + public static Document[] parseWithLimits( + final DigestURL location, String mimeType, final String charset, final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks, long maxBytes) throws Parser.Failure{ - return parseSource(location, mimeType, charset, new HashSet(), new VocabularyScraper(), timezoneOffset, depth, contentLength, + return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet(), new VocabularyScraper(), timezoneOffset, depth, contentLength, sourceStream, maxLinks, maxBytes); } @@ -475,7 +497,8 @@ public final class TextParser { final String mimeType, final Parser parser, final String charset, - final Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final InputStream sourceStream, @@ -491,11 +514,11 @@ public final class TextParser { try { final Document[] docs; if(parser.isParseWithLimitsSupported()) { - docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes); + docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes); } else { /* Parser do not support partial parsing within limits : let's control it here*/ final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes); - docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource); + docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, limitedSource); } return docs; } catch(final 
Parser.Failure e) { @@ -524,7 +547,8 @@ public final class TextParser { final String mimeType, final Set parsers, final String charset, - final Set ignore_class_name, + final TagValency defaultValency, + final Set valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final int depth, @@ -552,13 +576,13 @@ public final class TextParser { } try { if(parser.isParseWithLimitsSupported()) { - docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes); + docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis, maxLinks, maxBytes); } else { /* Partial parsing is not supported by this parser : check content length now */ if(sourceArray.length > maxBytes) { throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location); } - docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis); + docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis); } } catch (final Parser.Failure e) { if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException && diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index d6dbb5b83..b3d75776d 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -68,6 +68,7 @@ import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.document.content.SurrogateReader; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.util.NamePrefixThreadFactory; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; @@ -101,11 +102,11 @@ public class MediawikiImporter extends Thread implements Importer { public MediawikiImporter(final MultiProtocolURL sourcefile, final File targetdir) { - super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")"); - this.sourcefile = sourcefile; - this.docsize = sourcefile.length(); - this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L); - this.targetdir = targetdir; + super("MediawikiImporter(" + sourcefile != null ? 
sourcefile.toNormalform(true) : "null sourcefile" +")"); + this.sourcefile = sourcefile; + this.docsize = sourcefile.length(); + this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L); + this.targetdir = targetdir; this.count = 0; this.start = 0; this.hostport = null; @@ -154,7 +155,7 @@ public class MediawikiImporter extends Thread implements Importer { } @SuppressWarnings("resource") - @Override + @Override public void run() { this.start = System.currentTimeMillis(); final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); @@ -179,8 +180,8 @@ public class MediawikiImporter extends Thread implements Importer { boolean page = false, text = false; String title = null; final BlockingQueue in = new ArrayBlockingQueue(threads * 10); - final ExecutorService service = Executors.newCachedThreadPool( - new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer")); + final ExecutorService service = Executors.newCachedThreadPool( + new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer")); final convertConsumer[] consumers = new convertConsumer[threads]; final Future[] consumerResults = (Future[]) Array.newInstance(Future.class, threads); for (int i = 0; i < threads; i++) { @@ -276,23 +277,23 @@ public class MediawikiImporter extends Thread implements Importer { consumerResults[i].get(10000, TimeUnit.MILLISECONDS); } } catch (final Exception e) { - this.errorMessage = e.getMessage(); + this.errorMessage = e.getMessage(); ConcurrentLog.logException(e); } finally { out.put(poison); // output thread condition (for file.close) writerResult.get(10000, TimeUnit.MILLISECONDS); } } catch (final Exception e) { - this.errorMessage = e.getMessage(); + this.errorMessage = e.getMessage(); ConcurrentLog.logException(e); } finally { - if(reader != null) { + if(reader != null) { try { - reader.close(); - } catch (IOException e) { - ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage()); - } - } + reader.close(); + } catch (IOException e) { + ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage()); + } + } try { out.put(poison); // out keeps output file open until poisened, to close file if exception happend in this block } catch (InterruptedException ex) { } @@ -310,7 +311,7 @@ public class MediawikiImporter extends Thread implements Importer { File mediawikixml; public indexMaker(final File mediawikixml) { - super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : ""); + super("MediawikiImporter.indexMaker " + mediawikixml != null ? 
mediawikixml.getName() : ""); this.mediawikixml = mediawikixml; } @@ -337,8 +338,8 @@ public class MediawikiImporter extends Thread implements Importer { final PositionAwareReader in = new PositionAwareReader(dumpFile); final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile)); final wikiConsumer consumer = new wikiConsumer(100, producer); - final ExecutorService service = Executors.newCachedThreadPool( - new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex")); + final ExecutorService service = Executors.newCachedThreadPool( + new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex")); final Future producerResult = service.submit(consumer); final Future consumerResult = service.submit(producer); service.shutdown(); @@ -535,14 +536,14 @@ public class MediawikiImporter extends Thread implements Importer { } public void genDocument() throws Parser.Failure { try { - this.url = new AnchorURL(this.urlStub + this.title); - final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html)); - this.document = Document.mergeDocuments(this.url, "text/html", parsed); - // the wiki parser is not able to find the proper title in the source text, so it must be set here - this.document.setTitle(this.title); - } catch (final MalformedURLException e1) { - ConcurrentLog.logException(e1); - } + this.url = new AnchorURL(this.urlStub + this.title); + final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), TagValency.EVAL, new HashSet(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html)); + this.document = Document.mergeDocuments(this.url, "text/html", parsed); + // the wiki parser is not able to find the proper title in the source text, so it must be set here + this.document.setTitle(this.title); + } catch (final MalformedURLException e1) { + ConcurrentLog.logException(e1); + } } public void writeXML(final OutputStreamWriter os) throws IOException { this.document.writeXML(os); @@ -676,9 +677,9 @@ public class MediawikiImporter extends Thread implements Importer { } catch (final Parser.Failure e) { ConcurrentLog.logException(e); } catch (final IOException e) { - // TODO Auto-generated catch block + // TODO Auto-generated catch block ConcurrentLog.logException(e); - } + } } } catch (final InterruptedException e) { ConcurrentLog.logException(e); @@ -772,78 +773,78 @@ public class MediawikiImporter extends Thread implements Importer { } - public static void main(final String[] s) { - if (s.length == 0) { - System.out.println("usage:"); - System.out.println(" -index "); - System.out.println(" -read "); - System.out.println(" -find <wikipedia-dump>"); - System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); - ConcurrentLog.shutdown(); - return; - } - - try { - // example: - // java -Xmx2000m -cp classes:lib/bzip2.jar - // de.anomic.tools.mediawikiIndex -convert - // DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 - // DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ - - if (s[0].equals("-convert")) { - if(s.length < 3) { - System.out.println("usage:"); - System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); - ConcurrentLog.shutdown(); - return; - } - final File targetdir = new File(s[2]); - try { - final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir); - mi.start(); - mi.join(); - } 
catch (final InterruptedException e) { - ConcurrentLog.logException(e); - } catch (MalformedURLException e) { - ConcurrentLog.logException(e); - } - } - - if (s[0].equals("-index")) { - try { - createIndex(new File(s[1])); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } - } - - if (s[0].equals("-read")) { - final long start = Integer.parseInt(s[1]); - final int len = Integer.parseInt(s[2]); - System.out.println(UTF8.String(read(new File(s[3]), start, len))); - } - - if (s[0].equals("-find")) { - try { - final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); - if (w == null) { - ConcurrentLog.info("WIKITRANSLATION", "not found"); - } else { - System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start)))); - } - } catch (final IOException e) { - ConcurrentLog.logException(e); - } - - } - } finally { - try { - HTTPClient.closeConnectionManager(); - } catch (InterruptedException e) { - e.printStackTrace(); - } - ConcurrentLog.shutdown(); - } - } + public static void main(final String[] s) { + if (s.length == 0) { + System.out.println("usage:"); + System.out.println(" -index <wikipedia-dump>"); + System.out.println(" -read <start> <len> <idx-file>"); + System.out.println(" -find <title> <wikipedia-dump>"); + System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); + ConcurrentLog.shutdown(); + return; + } + + try { + // example: + // java -Xmx2000m -cp classes:lib/bzip2.jar + // de.anomic.tools.mediawikiIndex -convert + // DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 + // DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ + + if (s[0].equals("-convert")) { + if(s.length < 3) { + System.out.println("usage:"); + System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); + ConcurrentLog.shutdown(); + return; + } + final File targetdir = new File(s[2]); + try { + final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir); + mi.start(); + mi.join(); + } catch (final InterruptedException e) { + ConcurrentLog.logException(e); + } catch (MalformedURLException e) { + ConcurrentLog.logException(e); + } + } + + if (s[0].equals("-index")) { + try { + createIndex(new File(s[1])); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + } + + if (s[0].equals("-read")) { + final long start = Integer.parseInt(s[1]); + final int len = Integer.parseInt(s[2]); + System.out.println(UTF8.String(read(new File(s[3]), start, len))); + } + + if (s[0].equals("-find")) { + try { + final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); + if (w == null) { + ConcurrentLog.info("WIKITRANSLATION", "not found"); + } else { + System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start)))); + } + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + + } + } finally { + try { + HTTPClient.closeConnectionManager(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + ConcurrentLog.shutdown(); + } + } } diff --git a/source/net/yacy/document/parser/AbstractCompressorParser.java b/source/net/yacy/document/parser/AbstractCompressorParser.java index 753b894a4..09f385c05 100644 --- a/source/net/yacy/document/parser/AbstractCompressorParser.java +++ b/source/net/yacy/document/parser/AbstractCompressorParser.java @@ -37,6 +37,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import 
net.yacy.document.parser.html.TagValency; /** * Base class for parsing compressed files relying on Apache commons-compress @@ -44,25 +45,25 @@ import net.yacy.document.VocabularyScraper; */ public abstract class AbstractCompressorParser extends AbstractParser implements Parser { - /** Crawl depth applied when parsing internal compressed content */ - protected static final int DEFAULT_DEPTH = 999; - - /** - * @param name the human readable name of the parser - */ - public AbstractCompressorParser(final String name) { - super(name); - } - - /** - * @param source an open input stream on a compressed source - * @return a sub class of CompressorInputStream capable of uncompressing the source - * on the fly - * @throws IOException when an error occurred when trying to open the compressed - * stream - */ - protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException; - + /** Crawl depth applied when parsing internal compressed content */ + protected static final int DEFAULT_DEPTH = 999; + + /** + * @param name the human readable name of the parser + */ + public AbstractCompressorParser(final String name) { + super(name); + } + + /** + * @param source an open input stream on a compressed source + * @return a sub class of CompressorInputStream capable of uncompressing the source + * on the fly + * @throws IOException when an error occurred when trying to open the compressed + * stream + */ + protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException; + /** * Maps the given name of a compressed file to the name that the * file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned. @@ -72,116 +73,137 @@ public abstract class AbstractCompressorParser extends AbstractParser implements */ protected abstract String getUncompressedFilename(final String filename); - @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, - final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset, - final InputStream source) throws Parser.Failure, InterruptedException { - - return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE, - Long.MAX_VALUE); - } - - @Override - public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, - final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset, - final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure { - Document maindoc; - final CompressorInputStream compressedInStream; - try { - compressedInStream = createDecompressStream(source); - } catch (final IOException | RuntimeException e) { - throw new Parser.Failure("Unexpected error while parsing compressed file. 
" + e.getMessage(), location); - } - - try { - // create maindoc for this archive, register with supplied url & mime - maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this); - - final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset, - AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes); - if (docs != null) { - maindoc.addSubDocuments(docs); - if (docs.length > 0 && docs[0].isPartiallyParsed()) { - maindoc.setPartiallyParsed(true); - } - } - } catch (final Parser.Failure e) { - throw e; - } catch (final IOException | RuntimeException e) { - throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location); - } - return new Document[] { maindoc }; - } - - /** - * Create the main parsed document for the compressed document at the given URL - * and Media type - * - * @param location the parsed resource URL - * @param mimeType the media type of the resource - * @param charset the charset name if known - * @param parser an instance of CompressorParser that is registered as the - * parser origin of the document - * @return a Document instance - */ - protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, - final AbstractCompressorParser parser) { - final String filename = location.getFileName(); - return new Document(location, mimeType, charset, parser, null, null, - AbstractParser - .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title - null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); - } - - /** - * Parse content in an open stream uncompressing on the fly a compressed - * resource. - * - * @param location the URL of the compressed resource - * @param charset the charset name if known - * @param ignoreClassNames an eventual set of CSS class names whose matching - * html elements content should be ignored - * @param timezoneOffset the local time zone offset - * @param compressedInStream an open stream uncompressing on the fly the - * compressed content - * @param maxLinks the maximum total number of links to parse and add - * to the result documents - * @param maxBytes the maximum number of content bytes to process - * @return a list of documents that result from parsing the source, with empty - * or null text. 
- * @throws Parser.Failure when the parser processing failed - */ - protected Document[] parseCompressedInputStream(final DigestURL location, final String charset, - final Set<String> ignoreClassNames, final int timezoneOffset, final int depth, - final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { - final String compressedFileName = location.getFileName(); - final String contentfilename = getUncompressedFilename(compressedFileName); - final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); - try { - /* - * Use the uncompressed file name for sub parsers to not unnecessarily use again - * this same uncompressing parser - */ - final String locationPath = location.getPath(); - final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) - + contentfilename; - final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), - location.getPort(), contentPath); - - /* - * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on - * compressed content - */ - return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth, - -1, compressedInStream, maxLinks, maxBytes); - } catch (final MalformedURLException e) { - throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location); - } - } - - @Override - public boolean isParseWithLimitsSupported() { - return true; - } + @Override + public Document[] parse( + final DigestURL location, + final String mimeType, + final String charset, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source) throws Parser.Failure, InterruptedException { + + return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE, + Long.MAX_VALUE); + } + + @Override + public Document[] parseWithLimits( + final DigestURL location, + final String mimeType, + final String charset, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, + final VocabularyScraper scraper, + final int timezoneOffset, + final InputStream source, + final int maxLinks, + final long maxBytes) throws Parser.Failure { + Document maindoc; + final CompressorInputStream compressedInStream; + try { + compressedInStream = createDecompressStream(source); + } catch (final IOException | RuntimeException e) { + throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location); + } + + try { + // create maindoc for this archive, register with supplied url & mime + maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this); + + final Document[] docs = this.parseCompressedInputStream(location, null, defaultValency, valencySwitchTagNames, timezoneOffset, + AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes); + if (docs != null) { + maindoc.addSubDocuments(docs); + if (docs.length > 0 && docs[0].isPartiallyParsed()) { + maindoc.setPartiallyParsed(true); + } + } + } catch (final Parser.Failure e) { + throw e; + } catch (final IOException | RuntimeException e) { + throw new Parser.Failure("Unexpected error while parsing compressed file. 
" + e.getMessage(), location); + } + return new Document[] { maindoc }; + } + + /** + * Create the main parsed document for the compressed document at the given URL + * and Media type + * + * @param location the parsed resource URL + * @param mimeType the media type of the resource + * @param charset the charset name if known + * @param parser an instance of CompressorParser that is registered as the + * parser origin of the document + * @return a Document instance + */ + protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, + final AbstractCompressorParser parser) { + final String filename = location.getFileName(); + return new Document(location, mimeType, charset, parser, null, null, + AbstractParser + .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title + null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); + } + + /** + * Parse content in an open stream uncompressing on the fly a compressed + * resource. + * + * @param location the URL of the compressed resource + * @param charset the charset name if known + * @param ignoreClassNames an eventual set of CSS class names whose matching + * html elements content should be ignored + * @param timezoneOffset the local time zone offset + * @param compressedInStream an open stream uncompressing on the fly the + * compressed content + * @param maxLinks the maximum total number of links to parse and add + * to the result documents + * @param maxBytes the maximum number of content bytes to process + * @return a list of documents that result from parsing the source, with empty + * or null text. + * @throws Parser.Failure when the parser processing failed + */ + protected Document[] parseCompressedInputStream( + final DigestURL location, + final String charset, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, + final int timezoneOffset, final int depth, + final CompressorInputStream compressedInStream, + final int maxLinks, + final long maxBytes) throws Failure { + final String compressedFileName = location.getFileName(); + final String contentfilename = getUncompressedFilename(compressedFileName); + final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); + try { + /* + * Use the uncompressed file name for sub parsers to not unnecessarily use again + * this same uncompressing parser + */ + final String locationPath = location.getPath(); + final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + + contentfilename; + final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), + location.getPort(), contentPath); + + /* + * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on + * compressed content + */ + return TextParser.parseWithLimits( + contentLocation, mime, charset, defaultValency, valencySwitchTagNames, timezoneOffset, depth, + -1, compressedInStream, maxLinks, maxBytes); + } catch (final MalformedURLException e) { + throw new Parser.Failure("Unexpected error while parsing compressed file. 
" + e.getMessage(), location); + } + } + + @Override + public boolean isParseWithLimitsSupported() { + return true; + } } diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 15a63a41b..5eaed9a1d 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -45,6 +45,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.util.FileUtils; /** @@ -52,7 +53,7 @@ import net.yacy.kelondro.util.FileUtils; * Unzips and parses the content and adds it to the created main document */ public class bzipParser extends AbstractParser implements Parser { - + public bzipParser() { super("Bzip 2 UNIX Compressed File Parser"); this.SUPPORTED_EXTENSIONS.add("bz2"); @@ -70,7 +71,8 @@ public class bzipParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, final String charset, - Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) @@ -99,25 +101,25 @@ public class bzipParser extends AbstractParser implements Parser { out = null; } catch(Exception e) { - if (tempFile != null) { - FileUtils.deletedelete(tempFile); - } - throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); + if (tempFile != null) { + FileUtils.deletedelete(tempFile); + } + throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); } finally { - if(zippedContent != null) { - try { - zippedContent.close(); - } catch (IOException ignored) { - log.warn("Could not close bzip input stream"); - } - } - if(out != null) { - try { - out.close(); - } catch (IOException e) { - throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); - } - } + if(zippedContent != null) { + try { + zippedContent.close(); + } catch (IOException ignored) { + log.warn("Could not close bzip input stream"); + } + } + if(out != null) { + try { + out.close(); + } catch (IOException e) { + throw new Parser.Failure("Unexpected error while parsing bzip file. 
" + e.getMessage(), location); + } + } } try { // create maindoc for this bzip container, register with supplied url & mime @@ -125,7 +127,7 @@ public class bzipParser extends AbstractParser implements Parser { // creating a new parser class to parse the unzipped content final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); - final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile); + final Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tempFile); if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; @@ -140,7 +142,7 @@ public class bzipParser extends AbstractParser implements Parser { @Override public boolean isParseWithLimitsSupported() { - return true; + return true; } /** @@ -151,9 +153,9 @@ public class bzipParser extends AbstractParser implements Parser { * @param parser instance of bzipParser that is registered as the parser origin of the document * @return a Document instance */ - public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) { - final String filename = location.getFileName(); - Document maindoc = new Document( + public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) { + final String filename = location.getFileName(); + Document maindoc = new Document( location, mimeType, charset, @@ -172,49 +174,48 @@ public class bzipParser extends AbstractParser implements Parser { null, false, new Date()); - return maindoc; - } - - /** - * Parse content in an open stream uncompressing on the fly a bzipped resource. - * @param location the URL of the bzipped resource - * @param charset the charset name if known - * @param timezoneOffset the local time zone offset - * @param compressedInStream an open stream uncompressing on the fly the compressed content - * @param maxLinks - * the maximum total number of links to parse and add to the - * result documents - * @param maxBytes - * the maximum number of content bytes to process - * @return a list of documents that result from parsing the source, with - * empty or null text. - * @throws Parser.Failure - * when the parser processing failed - */ - public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth, - final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { + return maindoc; + } + + /** + * Parse content in an open stream uncompressing on the fly a bzipped resource. + * @param location the URL of the bzipped resource + * @param charset the charset name if known + * @param timezoneOffset the local time zone offset + * @param compressedInStream an open stream uncompressing on the fly the compressed content + * @param maxLinks + * the maximum total number of links to parse and add to the + * result documents + * @param maxBytes + * the maximum number of content bytes to process + * @return a list of documents that result from parsing the source, with + * empty or null text. 
+ * @throws Parser.Failure + * when the parser processing failed + */ + public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth, + final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { // creating a new parser class to parse the unzipped content - final String compressedFileName = location.getFileName(); + final String compressedFileName = location.getFileName(); final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); try { - /* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */ - final String locationPath = location.getPath(); - final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename; - final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath); - - /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ - return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes); - } catch (MalformedURLException e) { - throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); - } - } - - + /* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */ + final String locationPath = location.getPath(); + final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename; + final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath); + + /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ + return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes); + } catch (MalformedURLException e) { + throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); + } + } + @Override public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, - final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) - throws Parser.Failure { + final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) + throws Parser.Failure { Document maindoc = null; BZip2CompressorInputStream zippedContent = null; try { @@ -222,23 +223,23 @@ public class bzipParser extends AbstractParser implements Parser { zippedContent = new BZip2CompressorInputStream(source); } catch(Exception e) { - throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); + throw new Parser.Failure("Unexpected error while parsing bzip file. 
" + e.getMessage(), location); } - + try { // create maindoc for this bzip container, register with supplied url & mime maindoc = createMainDocument(location, mimeType, charset, this); // creating a new parser class to parse the unzipped content final Document[] docs = parseCompressedInputStream(location, null, timezoneOffset, 999, zippedContent, maxLinks, maxBytes); if (docs != null) { - maindoc.addSubDocuments(docs); - if(docs.length > 0 && docs[0].isPartiallyParsed()) { - maindoc.setPartiallyParsed(true); - } + maindoc.addSubDocuments(docs); + if(docs.length > 0 && docs[0].isPartiallyParsed()) { + maindoc.setPartiallyParsed(true); + } } } catch (final Exception e) { if (e instanceof Parser.Failure) { - throw (Parser.Failure) e; + throw (Parser.Failure) e; } throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location); diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index d81d6d43a..dc4b58ae6 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -45,6 +45,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.util.FileUtils; /** @@ -52,8 +53,8 @@ import net.yacy.kelondro.util.FileUtils; * Unzips and parses the content and adds it to the created main document */ public class gzipParser extends AbstractParser implements Parser { - - private static final int DEFAULT_DEPTH = 999; + + private static final int DEFAULT_DEPTH = 999; public gzipParser() { super("GNU Zip Compressed Archive Parser"); @@ -72,7 +73,8 @@ public class gzipParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, final String charset, - Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { @@ -84,10 +86,10 @@ public class gzipParser extends AbstractParser implements Parser { try { zippedContent = new GZIPInputStream(source); } catch(IOException e) { - /* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening - * and eventually apply special error handling */ - throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, - new GZIPOpeningStreamException()); + /* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening + * and eventually apply special error handling */ + throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, + new GZIPOpeningStreamException()); } try { int read = 0; @@ -103,32 +105,32 @@ public class gzipParser extends AbstractParser implements Parser { out.write(data, 0, read); } } catch(Exception e) { - if (tempFile != null) { - FileUtils.deletedelete(tempFile); - } - throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); + if (tempFile != null) { + FileUtils.deletedelete(tempFile); + } + throw new Parser.Failure("Unexpected error while parsing gzip file. 
" + e.getMessage(), location); } finally { - if(zippedContent != null) { - try { - zippedContent.close(); - } catch (IOException ignored) { - log.warn("Could not close gzip input stream"); - } - } - if(out != null) { - try { - out.close(); - } catch (IOException e) { - throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); - } - } + if(zippedContent != null) { + try { + zippedContent.close(); + } catch (IOException ignored) { + log.warn("Could not close gzip input stream"); + } + } + if(out != null) { + try { + out.close(); + } catch (IOException e) { + throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); + } + } } try { maindoc = createMainDocument(location, mimeType, charset, this); // creating a new parser class to parse the unzipped content final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName()); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); - Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile); + Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile); if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; @@ -149,96 +151,96 @@ public class gzipParser extends AbstractParser implements Parser { * @param an instance of gzipParser that is registered as the parser origin of the document * @return a Document instance */ - public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) { - final String filename = location.getFileName(); - Document maindoc = new Document( - location, - mimeType, - charset, - parser, - null, - null, - AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title - null, - null, - null, - null, - 0.0d, 0.0d, - (Object) null, - null, - null, - null, - false, - new Date()); - return maindoc; - } - - /** - * Parse content in an open stream uncompressing on the fly a gzipped resource. - * @param location the URL of the gzipped resource - * @param charset the charset name if known - * @param timezoneOffset the local time zone offset - * @param compressedInStream an open stream uncompressing on the fly the compressed content - * @param maxLinks - * the maximum total number of links to parse and add to the - * result documents - * @param maxBytes - * the maximum number of content bytes to process - * @return a list of documents that result from parsing the source, with - * empty or null text. - * @throws Parser.Failure - * when the parser processing failed - */ - public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth, - final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { + public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) { + final String filename = location.getFileName(); + Document maindoc = new Document( + location, + mimeType, + charset, + parser, + null, + null, + AbstractParser.singleList(filename.isEmpty() ? 
location.toTokens() : MultiProtocolURL.unescape(filename)), // title + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); + return maindoc; + } + + /** + * Parse content in an open stream uncompressing on the fly a gzipped resource. + * @param location the URL of the gzipped resource + * @param charset the charset name if known + * @param timezoneOffset the local time zone offset + * @param compressedInStream an open stream uncompressing on the fly the compressed content + * @param maxLinks + * the maximum total number of links to parse and add to the + * result documents + * @param maxBytes + * the maximum number of content bytes to process + * @return a list of documents that result from parsing the source, with + * empty or null text. + * @throws Parser.Failure + * when the parser processing failed + */ + public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth, + final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { // creating a new parser class to parse the unzipped content - final String compressedFileName = location.getFileName(); + final String compressedFileName = location.getFileName(); final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); try { - /* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */ - final String locationPath = location.getPath(); - final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename; - final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath); - - /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ - return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes); - } catch (MalformedURLException e) { - throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); - } - } - + /* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */ + final String locationPath = location.getPath(); + final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename; + final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath); + + /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ + return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes); + } catch (MalformedURLException e) { + throw new Parser.Failure("Unexpected error while parsing gzip file. 
" + e.getMessage(), location); + } + } + @Override public boolean isParseWithLimitsSupported() { - return true; + return true; } - + @Override public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, - final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) - throws Parser.Failure { + final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) + throws Parser.Failure { Document maindoc = null; GZIPInputStream zippedContent = null; try { - /* Only use in-memory stream here (no temporary file) : the parsers - * matching compressed content are expected to handle properly the maxBytes limit and terminate - * before an eventual OutOfMemory occurs */ + /* Only use in-memory stream here (no temporary file) : the parsers + * matching compressed content are expected to handle properly the maxBytes limit and terminate + * before an eventual OutOfMemory occurs */ zippedContent = new GZIPInputStream(source); } catch(IOException e) { - /* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening - * and eventually apply special error handling */ - throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, - new GZIPOpeningStreamException()); + /* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening + * and eventually apply special error handling */ + throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, + new GZIPOpeningStreamException()); } try { maindoc = createMainDocument(location, mimeType, charset, this); - + Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes); if (docs != null) { - maindoc.addSubDocuments(docs); - if(docs.length > 0 && docs[0].isPartiallyParsed()) { - maindoc.setPartiallyParsed(true); - } + maindoc.addSubDocuments(docs); + if(docs.length > 0 && docs[0].isPartiallyParsed()) { + maindoc.setPartiallyParsed(true); + } } } catch (final Exception e) { throw new Parser.Failure("Unexpected error while parsing gzip file. 
" + e.getMessage(),location); @@ -251,15 +253,15 @@ public class gzipParser extends AbstractParser implements Parser { */ public class GZIPOpeningStreamException extends Exception { - /** The serialization ID */ - private static final long serialVersionUID = 2824038185373304636L; + /** The serialization ID */ + private static final long serialVersionUID = 2824038185373304636L; + + public GZIPOpeningStreamException() { + super(); + } - public GZIPOpeningStreamException() { - super(); - } - - public GZIPOpeningStreamException(final String message) { - super(message); - } + public GZIPOpeningStreamException(final String message) { + super(message); + } } } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index a9e5e50a0..960957490 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -63,7 +63,7 @@ import net.yacy.document.parser.html.TransformerWriter; public class htmlParser extends AbstractParser implements Parser { - /** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */ + /** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */ private static final int DEFAULT_MAX_LINKS = 10000; public htmlParser() { @@ -108,42 +108,93 @@ public class htmlParser extends AbstractParser implements Parser { final int timezoneOffset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { - return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE); + return parseWithLimits( + location, + mimeType, + documentCharset, + TagValency.EVAL, + new HashSet<String>(), + vocscraper, + timezoneOffset, + sourceStream, + Integer.MAX_VALUE, + DEFAULT_MAX_LINKS, + Long.MAX_VALUE); } - + @Override public Document[] parse( final DigestURL location, final String mimeType, final String documentCharset, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper vocscraper, final int timezoneOffset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { - return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE); + return parseWithLimits( + location, mimeType, + documentCharset, + defaultValency, + valencySwitchTagNames, + vocscraper, + timezoneOffset, + sourceStream, + Integer.MAX_VALUE, + DEFAULT_MAX_LINKS, + Long.MAX_VALUE); } @Override public boolean isParseWithLimitsSupported() { - return true; + return true; } @Override - public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, - final Set<String> ignore_class_name, final VocabularyScraper vocscraper, - final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes) - throws Failure { - return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes); + public Document[] parseWithLimits( + final DigestURL location, + final String mimeType, + final String documentCharset, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, + final VocabularyScraper 
vocscraper, + final int timezoneOffset, + final InputStream sourceStream, + final int maxLinks, + final long maxBytes) + throws Failure { + return parseWithLimits( + location, + mimeType, + documentCharset, + defaultValency, + valencySwitchTagNames, + vocscraper, + timezoneOffset, + sourceStream, + maxLinks, + maxLinks, + maxBytes); } - private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper, - final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes) - throws Failure { + private Document[] parseWithLimits( + final DigestURL location, + final String mimeType, + final String documentCharset, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, + final VocabularyScraper vocscraper, + final int timezoneOffset, + final InputStream sourceStream, + final int maxAnchors, + final int maxLinks, + final long maxBytes) + throws Failure { try { // first get a document from the parsed html Charset[] detectedcharsetcontainer = new Charset[]{null}; - ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes); + ContentScraper scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes); // parseToScraper also detects/corrects/sets charset from html content tag final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); Document documentSnapshot = null; @@ -152,10 +203,10 @@ public class htmlParser extends AbstractParser implements Parser { // and create a sub-document for snapshot page (which will be merged by loader) // TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler if (location.getRef() != null && location.getRef().startsWith("!")) { - documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes); + documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes); } else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both) if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) { - documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes); + documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes); } } } catch (Exception ex1) { // ignore any exception for any issue with snapshot @@ -221,7 +272,16 @@ public class htmlParser extends AbstractParser implements Parser { return ppd; } - public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, 
final int maxAnchors, final int maxLinks) throws IOException { + public static ContentScraper parseToScraper( + final DigestURL location, + final String documentCharset, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, + final VocabularyScraper vocabularyScraper, + final int timezoneOffset, + final String input, + final int maxAnchors, + final int maxLinks) throws IOException { Charset[] detectedcharsetcontainer = new Charset[]{null}; InputStream sourceStream; try { @@ -231,7 +291,7 @@ public class htmlParser extends AbstractParser implements Parser { } ContentScraper scraper; // for this static methode no need to init local this.scraperObject try { - scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE); + scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE); } catch (Failure e) { throw new IOException(e.getMessage()); } @@ -256,7 +316,8 @@ public class htmlParser extends AbstractParser implements Parser { public static ContentScraper parseToScraper( final DigestURL location, final String documentCharset, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper vocabularyScraper, final Charset[] detectedcharsetcontainer, final int timezoneOffset, @@ -264,7 +325,7 @@ public class htmlParser extends AbstractParser implements Parser { final int maxAnchors, final int maxLinks, final long maxBytes) throws Parser.Failure, IOException { - + // make a scraper String charset = null; @@ -280,8 +341,8 @@ public class htmlParser extends AbstractParser implements Parser { htmlFilter = new ScraperInputStream( sourceStream, documentCharset, - ignore_class_name, - TagValency.EVAL, + valencySwitchTagNames, + defaultValency, vocabularyScraper, location, false, @@ -325,26 +386,26 @@ public class htmlParser extends AbstractParser implements Parser { location, maxAnchors, maxLinks, - ignore_class_name, + valencySwitchTagNames, TagValency.EVAL, vocabularyScraper, timezoneOffset); final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available()))); try { - final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte()); - final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]); - final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars); + final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte()); + final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]); + final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars); if(copiedChars > maxChars) { - /* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */ - scraper.setContentSizeLimitExceeded(true); + /* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */ + scraper.setContentSizeLimitExceeded(true); } else if(copiedChars == maxChars) { - /* Exactly maxChars limit reached : let's check if more to read remain. 
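
The byte-to-character budget logic here is easy to misread, so the following standalone sketch shows the same pattern in isolation with plain java.io, commons-io and made-up sample data: derive a character budget from the byte budget, copy at most that many characters, then peek once to distinguish "exactly at the limit" from "truncated".

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import java.io.StringWriter;
    import java.nio.charset.StandardCharsets;

    import org.apache.commons.io.IOUtils;

    public class CharBudgetDemo {
        public static void main(final String[] args) throws IOException {
            final long maxBytes = 16;
            // derive a character budget from the byte budget, as parseToScraper does
            final long maxChars = (long) (maxBytes * StandardCharsets.UTF_8.newDecoder().averageCharsPerByte());

            final Reader source = new StringReader("0123456789abcdefghij"); // 20 chars of sample data
            final StringWriter sink = new StringWriter();

            final long copied = IOUtils.copyLarge(source, sink, 0, maxChars);
            boolean truncated = false;
            if (copied > maxChars) {
                truncated = true; // defensive branch, mirrors parseToScraper
            } else if (copied == maxChars && source.read() >= 0) {
                truncated = true; // budget exactly consumed and more input remains
            }
            System.out.println("copied=" + copied + ", truncated=" + truncated);
        }
    }
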
*/ - if(sourceReader.read() >= 0) { - scraper.setContentSizeLimitExceeded(true); - } + /* Exactly maxChars limit reached : let's check if more to read remain. */ + if(sourceReader.read() >= 0) { + scraper.setContentSizeLimitExceeded(true); + } } } catch (final IOException e) { - throw new Parser.Failure("IO error:" + e.getMessage(), location); + throw new Parser.Failure("IO error:" + e.getMessage(), location); } finally { writer.flush(); //sourceStream.close(); keep open for multipe parsing (close done by caller) @@ -456,9 +517,10 @@ public class htmlParser extends AbstractParser implements Parser { * @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot */ private Document parseAlternativeSnapshot( - final DigestURL location, final String mimeType, final String documentCharset, - final Set<String> ignore_class_name, final VocabularyScraper vocscraper, - final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) { + final DigestURL location, final String mimeType, final String documentCharset, + final TagValency defaultValency, final Set<String> valencySwitchTagNames, + final VocabularyScraper vocscraper, + final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) { Document documentSnapshot = null; try { // construct url for case (1) with anchor @@ -476,17 +538,17 @@ public class htmlParser extends AbstractParser implements Parser { Charset[] detectedcharsetcontainer = new Charset[]{null}; InputStream snapshotStream = null; try { - snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); - ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes); + snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); + ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes); documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot); } finally { - if(snapshotStream != null) { - try { - snapshotStream.close(); - } catch(IOException e) { - AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage()); - } - } + if(snapshotStream != null) { + try { + snapshotStream.close(); + } catch(IOException e) { + AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage()); + } + } } AbstractParser.log.info("parse snapshot "+locationSnapshot.toString() + " additional to " + location.toString()); } catch (IOException | Failure ex) { } diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 116cc9288..6b3f3bbb8 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -44,6 +44,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.util.FileUtils; import SevenZip.ArchiveExtractCallback; import SevenZip.IInStream; @@ -63,7 +64,8 @@ public class sevenzipParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, 
final String charset, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final int timezoneOffset, final IInStream source) throws Parser.Failure, InterruptedException { @@ -94,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser { } catch (final IOException e) { throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location); } - final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset); + final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset); AbstractParser.log.fine("processing archive contents..."); try { archive.Extract(null, -1, 0, aec); @@ -116,10 +118,11 @@ public class sevenzipParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, final String charset, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final int timezoneOffset, final byte[] source) throws Parser.Failure, InterruptedException { - return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source)); + return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, new ByteArrayIInStream(source)); } @Override @@ -127,14 +130,15 @@ public class sevenzipParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, final String charset, - Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); - return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())}; + return new Document[]{parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, cfos.toByteArray())}; } catch (final IOException e) { throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location); } @@ -148,7 +152,8 @@ public class sevenzipParser extends AbstractParser implements Parser { private ByteArrayOutputStream cfos = null; private final Document doc; private final String prefix; - private Set<String> ignore_class_name; + private final TagValency defaultValency; + private Set<String> valencySwitchTagNames; private final int timezoneOffset; public SZParserExtractCallback( @@ -156,13 +161,15 @@ public class sevenzipParser extends AbstractParser implements Parser { final IInArchive handler, final Document doc, final String prefix, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final int timezoneOffset) { super.Init(handler); this.log = logger; this.doc = doc; this.prefix = prefix; - this.ignore_class_name = ignore_class_name; + this.defaultValency = defaultValency; + this.valencySwitchTagNames = valencySwitchTagNames; this.timezoneOffset = timezoneOffset; } @@ -205,7 +212,7 @@ public class sevenzipParser extends AbstractParser implements Parser { // below for reversion of the effects final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + 
super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); - theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray()); + theDocs = TextParser.parseSource(url, mime, null,this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray()); this.doc.addSubDocuments(theDocs); } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index f1b7059c2..f0122c3cb 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -45,6 +45,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.util.FileUtils; // this is a new implementation of this parser idiom using multiple documents as result set @@ -70,7 +71,8 @@ public class tarParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, final String charset, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, InputStream source) throws Parser.Failure, InterruptedException { @@ -104,17 +106,17 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - /* - * Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive. - * We use the tar file name as the parent sub path. Example : http://host/archive.tar/name. - * Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the - * extension of the URL is still ".tar", thus incorrectly making the tar parser - * as a possible parser for the sub resource. - */ +/* + * Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive. + * We use the tar file name as the parent sub path. Example : http://host/archive.tar/name. + * Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the + * extension of the URL is still ".tar", thus incorrectly making the tar parser + * as a possible parser for the sub resource. 
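
To make the intended URL layout concrete, a small sketch of the entry URL derivation, mirroring the createParentTarURL(...) helper defined further down and the DigestURL(parent, name) call used here; the archive URL and entry name are example values, and imports are assumed to follow the surrounding file.

    // Map the entry "doc/readme.html" of http://host/archive.tar to
    // http://host/archive.tar/doc/readme.html, so that sub parser selection
    // sees ".html" instead of ".tar".
    static DigestURL entryLocation(final DigestURL tarUrl, final String entryName) throws MalformedURLException {
        String base = tarUrl.toNormalform(false);
        if (!base.endsWith("/")) {
            base += "/"; // same normalization as createParentTarURL
        }
        final DigestURL parentTarURL = new DigestURL(base);
        return new DigestURL(parentTarURL, entryName);
    }
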
+ */ final DigestURL subLocation = new DigestURL(parentTarURL, name); - final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp); + final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,999, tmp); if (subDocs == null) { - continue; + continue; } maindoc.addSubDocuments(subDocs); } catch (final Parser.Failure e) { @@ -130,146 +132,146 @@ public class tarParser extends AbstractParser implements Parser { return new Document[]{maindoc}; } - @Override - public boolean isParseWithLimitsSupported() { - return true; - } +@Override +public boolean isParseWithLimitsSupported() { +return true; +} - @Override - public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, - final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks, - final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException { +@Override +public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, +final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks, +final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException { - final DigestURL parentTarURL = createParentTarURL(location); +final DigestURL parentTarURL = createParentTarURL(location); - final TarArchiveInputStream tis = new TarArchiveInputStream(source); +final TarArchiveInputStream tis = new TarArchiveInputStream(source); - // create maindoc for this tar container - final Document maindoc = createMainDocument(location, mimeType, charset, this); +// create maindoc for this tar container +final Document maindoc = createMainDocument(location, mimeType, charset, this); - // loop through the elements in the tar file and parse every single file inside - TarArchiveEntry entry; - int totalProcessedLinks = 0; - while (true) { - try { - entry = tis.getNextTarEntry(); - if (entry == null) { - break; - } +// loop through the elements in the tar file and parse every single file inside +TarArchiveEntry entry; +int totalProcessedLinks = 0; +while (true) { +try { +entry = tis.getNextTarEntry(); +if (entry == null) { +break; +} - /* - * We are here sure at least one entry has still to be processed : let's check - * now the bytes limit as sub parsers applied on eventual previous entries may - * not support partial parsing and would have thrown a Parser.Failure instead of - * marking the document as partially parsed. - */ - if (tis.getBytesRead() >= maxBytes) { - maindoc.setPartiallyParsed(true); - break; - } +/* + * We are here sure at least one entry has still to be processed : let's check + * now the bytes limit as sub parsers applied on eventual previous entries may + * not support partial parsing and would have thrown a Parser.Failure instead of + * marking the document as partially parsed. + */ +if (tis.getBytesRead() >= maxBytes) { +maindoc.setPartiallyParsed(true); +break; +} - if (entry.isDirectory() || entry.getSize() <= 0) { - continue; - } - final String name = entry.getName(); - final int idx = name.lastIndexOf('.'); - final String mime = TextParser.mimeOf((idx > -1) ? 
name.substring(idx + 1) : ""); - try { - /* - * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on - * compressed content - */ +if (entry.isDirectory() || entry.getSize() <= 0) { +continue; +} +final String name = entry.getName(); +final int idx = name.lastIndexOf('.'); +final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : ""); +try { +/* + * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on + * compressed content + */ - /* - * Create an appropriate sub location to prevent unwanted fallback to the - * tarparser on resources included in the archive. We use the tar file name as - * the parent sub path. Example : http://host/archive.tar/name. Indeed if we - * create a sub location with a '#' separator such as - * http://host/archive.tar#name, the extension of the URL is still ".tar", thus - * incorrectly making the tar parser as a possible parser for the sub resource. - */ - final DigestURL subLocation = new DigestURL(parentTarURL, name); - final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999, - entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead()); +/* + * Create an appropriate sub location to prevent unwanted fallback to the + * tarparser on resources included in the archive. We use the tar file name as + * the parent sub path. Example : http://host/archive.tar/name. Indeed if we + * create a sub location with a '#' separator such as + * http://host/archive.tar#name, the extension of the URL is still ".tar", thus + * incorrectly making the tar parser as a possible parser for the sub resource. + */ +final DigestURL subLocation = new DigestURL(parentTarURL, name); +final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999, +entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead()); - /* - * If the parser(s) did not consume all bytes in the entry, these ones will be - * skipped by the next call to getNextTarEntry() - */ - if (subDocs == null) { - continue; - } - maindoc.addSubDocuments(subDocs); - for (Document subDoc : subDocs) { - if (subDoc.getAnchors() != null) { - totalProcessedLinks += subDoc.getAnchors().size(); - } - } - /* - * Check if a limit has been exceeded (we are sure to pass here when maxLinks - * has been exceeded as this limit require parser support for partial parsing to - * be detected) - */ - if (subDocs[0].isPartiallyParsed()) { - maindoc.setPartiallyParsed(true); - break; - } - } catch (final Parser.Failure e) { - AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); - } - } catch (final IOException e) { - AbstractParser.log.warn("tar parser:" + e.getMessage()); - break; - } - } - return new Document[] { maindoc }; - } +/* + * If the parser(s) did not consume all bytes in the entry, these ones will be + * skipped by the next call to getNextTarEntry() + */ +if (subDocs == null) { +continue; +} +maindoc.addSubDocuments(subDocs); +for (Document subDoc : subDocs) { +if (subDoc.getAnchors() != null) { +totalProcessedLinks += subDoc.getAnchors().size(); +} +} +/* + * Check if a limit has been exceeded (we are sure to pass here when maxLinks + * has been exceeded as this limit require parser support for partial parsing to + * be detected) + */ +if (subDocs[0].isPartiallyParsed()) { +maindoc.setPartiallyParsed(true); +break; +} +} catch (final Parser.Failure e) { +AbstractParser.log.warn("tar parser entry " + name + ": " + 
e.getMessage()); +} +} catch (final IOException e) { +AbstractParser.log.warn("tar parser:" + e.getMessage()); +break; +} +} +return new Document[] { maindoc }; +} - /** - * Generate a parent URL to use for generating sub URLs on tar archive entries. - * - * @param tarURL - * the URL of the tar archive - * @return an URL ending with a "/" suitable as a base URL for archive entries - */ - private DigestURL createParentTarURL(final DigestURL tarURL) { - String locationStr = tarURL.toNormalform(false); - if (!locationStr.endsWith("/")) { - locationStr += "/"; - } - DigestURL parentTarURL; - try { - parentTarURL = new DigestURL(locationStr); - } catch (MalformedURLException e1) { - /* This should not happen */ - parentTarURL = tarURL; - } - return parentTarURL; - } +/** + * Generate a parent URL to use for generating sub URLs on tar archive entries. + * + * @param tarURL + * the URL of the tar archive + * @return an URL ending with a "/" suitable as a base URL for archive entries + */ +private DigestURL createParentTarURL(final DigestURL tarURL) { +String locationStr = tarURL.toNormalform(false); +if (!locationStr.endsWith("/")) { +locationStr += "/"; +} +DigestURL parentTarURL; +try { +parentTarURL = new DigestURL(locationStr); +} catch (MalformedURLException e1) { +/* This should not happen */ +parentTarURL = tarURL; +} +return parentTarURL; +} - /** - * Create the main resulting parsed document for a tar container - * - * @param location - * the parsed resource URL - * @param mimeType - * the media type of the resource - * @param charset - * the charset name if known - * @param parser - * instance of tarParser that is registered as the parser origin of - * the document - * @return a Document instance - */ - public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, - final tarParser parser) { - final String filename = location.getFileName(); - final Document maindoc = new Document(location, mimeType, charset, parser, null, null, - AbstractParser - .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title - null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); - return maindoc; - } +/** + * Create the main resulting parsed document for a tar container + * + * @param location + * the parsed resource URL + * @param mimeType + * the media type of the resource + * @param charset + * the charset name if known + * @param parser + * instance of tarParser that is registered as the parser origin of + * the document + * @return a Document instance + */ +public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, +final tarParser parser) { +final String filename = location.getFileName(); +final Document maindoc = new Document(location, mimeType, charset, parser, null, null, +AbstractParser +.singleList(filename.isEmpty() ? 
location.toTokens() : MultiProtocolURL.unescape(filename)), // title +null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); +return maindoc; +} public final static boolean isTar(File f) { if (!f.exists() || f.length() < 0x105) return false; diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 7d47131a5..c994f096f 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -39,6 +39,7 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; @@ -72,7 +73,8 @@ public class zipParser extends AbstractParser implements Parser { final DigestURL location, final String mimeType, final String charset, - final Set<String> ignore_class_name, + final TagValency defaultValency, + final Set<String> valencySwitchTagNames, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) @@ -121,7 +123,7 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zis, tmp, entry.getSize()); final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp); + final Document[] docs = TextParser.parseSource(virtualURL, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp); if (docs == null) continue; maindoc.addSubDocuments(docs); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index 66750684a..44b944fe8 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -626,6 +626,7 @@ public class Crawler_p { cachePolicy, collection, agentName, + TagValency.EVAL, ignoreclassname, new VocabularyScraper(vocabulary_scraper), timezoneOffset); diff --git a/source/net/yacy/htroot/QuickCrawlLink_p.java b/source/net/yacy/htroot/QuickCrawlLink_p.java index adf0497d1..4230b69ea 100644 --- a/source/net/yacy/htroot/QuickCrawlLink_p.java +++ b/source/net/yacy/htroot/QuickCrawlLink_p.java @@ -43,6 +43,7 @@ import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Request; +import net.yacy.document.parser.html.TagValency; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; @@ -161,7 +162,7 @@ public class QuickCrawlLink_p { CacheStrategy.IFFRESH, collection, ClientIdentification.yacyIntranetCrawlerAgentName, - null, null, + TagValency.EVAL, null, null, timezoneOffset); sb.crawler.putActive(pe.handle().getBytes(), pe); } catch (final Exception e) { diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 0fa5e537a..1d1439482 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -709,7 +709,16 @@ public final class LoaderDispatcher { final String supportError = TextParser.supports(url, responseHeader.getContentType()); if (supportError != null) throw new 
IOException("no parser support: " + supportError); try { - documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent()); + documents = TextParser.parseSource( + url, + responseHeader.getContentType(), + responseHeader.getCharacterEncoding(), + response.profile().defaultValency(), + response.profile().valencySwitchTagNames(), + response.profile().scraper(), + timezoneOffset, + response.depth(), + response.getContent()); if (documents == null) throw new IOException("document == null"); } catch (final Exception e) { throw new IOException("parser error: " + e.getMessage()); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index a7a971ce5..34ebfcc94 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2945,7 +2945,8 @@ public final class Switchboard extends serverSwitch { documents = TextParser.genericParseSource(new AnchorURL(response.url()), response.getMimeType(), response.getCharacterEncoding(), - response.profile().ignoreDivClassName(), + response.profile().defaultValency(), + response.profile().valencySwitchTagNames(), response.profile().scraper(), response.profile().timezoneOffset(), response.depth(), @@ -2963,7 +2964,8 @@ public final class Switchboard extends serverSwitch { new AnchorURL(response.url()), response.getMimeType(), response.getCharacterEncoding(), - response.profile().ignoreDivClassName(), + response.profile().defaultValency(), + response.profile().valencySwitchTagNames(), response.profile().scraper(), response.profile().timezoneOffset(), response.depth(), diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index e7334a91b..db0619381 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -45,6 +45,7 @@ import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; +import net.yacy.document.parser.html.TagValency; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.WebgraphConfiguration; @@ -162,24 +163,24 @@ public class DocumentIndex extends Segment { } InputStream sourceStream = null; try { - sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); - documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream); + sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); + documents = TextParser.parseSource(url, null, null, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream); } catch (final Exception e ) { throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage()); } finally { - if(sourceStream != null) { - try { - sourceStream.close(); - } catch(IOException e) { - ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage()); - } - } + if(sourceStream != null) { + try { + sourceStream.close(); + } catch(IOException e) { + ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage()); + } + } } //Document document = 
Document.mergeDocuments(url, null, documents); final SolrInputDocument[] rows = new SolrInputDocument[documents.length]; int c = 0; for ( final Document document : documents ) { - if (document == null) continue; + if (document == null) continue; final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0); rows[c++] = super.storeDocument(
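
Editorial note on the recurring call-site pattern in this patch: every TextParser.parseSource caller now passes a default TagValency together with the set of tag or class names that switch that valency, replacing the former ignore_class_name set. The sketch below shows one such caller, modelled on the DocumentIndex change above (TagValency.EVAL plus an empty switch set); the helper method, its name, its enclosing class and its simplified exception handling are hypothetical, and only the argument order of the byte[] overload is taken from the calls visible in the patch.

import java.util.HashSet;
import java.util.Set;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;

// Hypothetical helper (sketch only, to be placed in some utility class): parse raw
// document bytes with the new valency-aware signature, evaluating all tags and
// switching none of them.
public static Document[] parseWithDefaultValency(final DigestURL url, final String mime,
        final byte[] content, final int timezoneOffset) throws Exception {
    final Set<String> valencySwitchTagNames = new HashSet<String>(); // no tag/class names flip the valency
    return TextParser.parseSource(
            url, mime, null,            // location, media type, charset (charset unknown here)
            TagValency.EVAL,            // default valency: evaluate every tag
            valencySwitchTagNames,      // names that switch to the opposite valency (empty)
            new VocabularyScraper(),    // no vocabulary scraping
            timezoneOffset,
            0,                          // crawl depth of this document
            content);                   // raw document bytes
}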