crawl profile adoption to new tag valency attribute

pull/554/head
Michael Christen 2 years ago
parent 5acd98f4da
commit 4304e07e6f

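This commit threads a new TagValency parameter through the crawl profiles and the parser call chain. The TagValency enum itself (net.yacy.document.parser.html.TagValency) is not part of this diff; judging from how it is used below (TagValency.EVAL as the default, a name()/valueOf() round trip, and class names that "switch" the default valency), a minimal sketch of what such an enum might look like follows. The IGNORE constant and the invert() helper are assumptions for illustration only.

    package net.yacy.document.parser.html;

    /**
     * Hypothetical sketch of the TagValency enum referenced throughout this commit.
     * Only the EVAL constant is visible in the diff below.
     */
    public enum TagValency {
        /** the content of a matching tag is evaluated (parsed and indexed) */
        EVAL,
        /** the content of a matching tag is ignored */
        IGNORE;

        /** opposite valency, e.g. when a configured DIV class name switches the default */
        public TagValency invert() {
            return this == EVAL ? IGNORE : EVAL;
        }
    }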
@@ -51,6 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
+import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
@@ -60,7 +61,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob";
@@ -75,7 +76,7 @@ public final class CrawlSwitchboard {
public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static {
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
DEFAULT_PROFILES.add(CRAWL_PROFILE_RECRAWL_JOB);
DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
@@ -93,11 +94,11 @@ public final class CrawlSwitchboard {
// Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl */
/**
* The default recrawl time cycle in minutes for recrawl jobs. The recrawl date
* limit can be set up by the recrawl job selection query, but a default limit
* prevent unwanted overload on targets)
*/
public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // on hour
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
@@ -139,7 +140,7 @@ public final class CrawlSwitchboard {
try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException | SpaceExceededException | RuntimeException e ) {
ConcurrentLog.warn("CrawlProfiles", "Could not load profile " + handle, e);
p = null;
}
if ( p == null ) {
@@ -275,16 +276,15 @@ public final class CrawlSwitchboard {
public RowHandleSet getURLHashes(final byte[] profileKey) {
return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey));
}
private void initActiveCrawlProfiles() {
final Switchboard sb = Switchboard.getSwitchboard();
// generate new default entry for deep auto crawl
this.defaultAutocrawlDeepProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_DEEP,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
@@ -308,12 +308,13 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
this.defaultAutocrawlDeepProfile);
// generate new default entry for shallow auto crawl
this.defaultAutocrawlShallowProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
@@ -341,6 +342,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -364,7 +366,7 @@ public final class CrawlSwitchboard {
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
@@ -373,6 +375,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -405,6 +408,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -437,6 +441,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -469,6 +474,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -509,6 +515,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -541,6 +548,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -573,6 +581,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -605,6 +614,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@@ -640,6 +650,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
+TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);

@@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
+import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@@ -355,7 +356,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
-ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
+ClientIdentification.yacyInternetCrawlerAgentName,
+TagValency.EVAL, null, null, 0);
return profile;
}

@@ -55,6 +55,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
+import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
@@ -69,19 +70,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
/** Regular expression pattern matching everything */
public static final String MATCH_ALL_STRING = ".*";
/** Regular expression pattern matching nothing */
public static final String MATCH_NEVER_STRING = "";
/** Empty Solr query */
public static final String SOLR_EMPTY_QUERY = "";
/** Match all Solr query */
public static final String SOLR_MATCH_ALL_QUERY = AbstractSolrConnector.CATCHALL_QUERY;
/** Regular expression matching everything */
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
/** Regular expression matching nothing */
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
@@ -126,14 +127,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
-IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
+DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
+VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
public static final int BOOLEAN = 0;
public static final int INTEGER = 1;
public static final int STRING = 2;
public final String key, label;
public final boolean readonly;
public final int type;
@@ -143,39 +145,39 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.type = type;
this.label = label;
}
@Override
public String toString() {
return this.key;
}
}
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustMatch = null;
/** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustNotMatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
/** Pattern on the media type documents must match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustMatch = null;
/** Pattern on the media type documents must not match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustNotMatch = null;
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
-private final Set<String> ignore_class_name;
+private final TagValency defaultValency;
+private final Set<String> valencySwitchTagNames;
private final VocabularyScraper scraper;
/**
@@ -238,7 +240,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
-final Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@@ -252,40 +255,42 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
put(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
put(CrawlAttribute.INDEXING_URL_MUSTMATCH.key, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
put(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
put(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
put(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
put(CrawlAttribute.DEPTH.key, depth);
put(CrawlAttribute.DIRECT_DOC_BY_URL.key, directDocByURL);
put(CrawlAttribute.RECRAWL_IF_OLDER.key, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
put(CrawlAttribute.DOM_MAX_PAGES.key, domMaxPages);
put(CrawlAttribute.CRAWLING_Q.key, crawlingQ); // crawling of urls with '?'
put(CrawlAttribute.FOLLOW_FRAMES.key, followFrames); // load pages contained in frames or ifames
put(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
put(CrawlAttribute.OBEY_HTML_ROBOTS_NOFOLLOW.key, obeyHtmlRobotsNofollow);
put(CrawlAttribute.INDEX_TEXT.key, indexText);
put(CrawlAttribute.INDEX_MEDIA.key, indexMedia);
put(CrawlAttribute.STORE_HTCACHE.key, storeHTCache);
put(CrawlAttribute.REMOTE_INDEXING.key, remoteIndexing);
put(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key, snapshotsMaxDepth);
put(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key, snapshotsLoadImage);
put(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key, snapshotsReplaceOld);
put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the ignore_class_name and scraper information into a JSON Array
-this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
-String jsonString = new JSONArray(ignore_class_name).toString();
-put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
+this.defaultValency = defaultValency;
+this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
+String jsonString = new JSONArray(valencySwitchTagNames).toString();
+put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
+put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
@@ -305,9 +310,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
-String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
+String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
+this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
+String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key);
JSONArray a;
-if(jsonString == null) {
+if (jsonString == null) {
a = new JSONArray();
} else {
try {
@@ -317,9 +324,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
a = new JSONArray();
}
}
-this.ignore_class_name = new HashSet<String>();
+this.valencySwitchTagNames = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
-this.ignore_class_name.add(a.getString(i));
+this.valencySwitchTagNames.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
@@ -336,14 +343,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
-public Set<String> ignoreDivClassName() {
-return this.ignore_class_name;
+public TagValency defaultValency() {
+return this.defaultValency;
+}
+public Set<String> valencySwitchTagNames() {
+return this.valencySwitchTagNames;
}
public VocabularyScraper scraper() {
return this.scraper;
}
public void domInc(final String domain) {
if (domain == null) return; // may be correct for file system crawls
final AtomicInteger dp = this.doms.get(domain);
@@ -427,7 +438,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
//if (r == null) return null;
return r;
}
private Map<String, Pattern> cmap = null;
/**
@@ -440,7 +451,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.cmap = collectionParser(r);
return this.cmap;
}
public static Map<String, Pattern> collectionParser(String collectionString) {
if (collectionString == null || collectionString.length() == 0) return new HashMap<String, Pattern>();
String[] cs = CommonPattern.COMMA.split(collectionString);
@@ -470,7 +481,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String r = get(CrawlAttribute.COLLECTIONS.key);
return r == null || r.length() == 0 || "user".equals(r) ? name() : r;
}
/**
* Gets the regex which must be matched by URLs in order to be crawled.
* @return regex which must be matched
@@ -484,7 +495,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerurlmustmatch;
}
/**
* Render the urlMustMatchPattern as a String of limited size, suffixing it with
* "..." when it is truncated. Used to prevent unnecessary growth of the logs,
@@ -516,7 +527,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerurlmustnotmatch;
}
/**
* Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
*
@@ -538,7 +549,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerOriginUrlMustMatch;
}
/**
* Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
*
@@ -601,7 +612,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (list.length == 1 && list.length == 0) list = new String[0];
return list;
}
/**
* If the regex matches with the url, then there is no depth limit on the crawl (it overrides depth == 0)
* @return regex which must be matched
@@ -643,7 +654,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexurlmustnotmatch;
}
/**
* Gets the regex which must be matched by URLs in order to be indexed.
* @return regex which must be matched
@@ -671,7 +682,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexcontentmustnotmatch;
}
/**
* Get the Pattern on media type that documents must match in order to be indexed
*
@@ -693,7 +704,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexMediaTypeMustMatch;
}
/**
* Get the Pattern on media type that documents must not match in order to be indexed
*
@@ -715,9 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexMediaTypeMustNotMatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
@@ -743,7 +752,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross check the eventual URL file
* extension against the actual Media Type, even when file extension is
@@ -772,7 +781,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public void setCacheStrategy(final CacheStrategy newStrategy) {
put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
}
/**
* Gets the minimum date that an entry must have to be re-crawled.
* @return time in ms representing a date
@@ -847,13 +856,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(CrawlAttribute.REMOTE_INDEXING.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public int snapshotMaxdepth() {
final String r = get(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key);
if (r == null) return -1;
@@ -866,7 +875,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return -1;
}
}
public boolean snapshotLoadImage() {
final String r = get(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key);
if (r == null) return false;
@@ -878,7 +887,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public Pattern snapshotsMustnotmatch() {
if (this.snapshotsMustnotmatch == null) {
final String r = get(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key);
@@ -887,7 +896,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
} catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.snapshotsMustnotmatch;
}
public int timezoneOffset() {
final String timezoneOffset = get(CrawlAttribute.TIMEZONEOFFSET.key);
@@ -898,7 +907,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0;
}
}
/**
* get a recrawl date for a given age in minutes
* @param oldTimeMinutes
@@ -946,7 +955,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(url.getPath()).append(".*").toString();
}
public boolean isPushCrawlProfile() {
return this.name().startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB);
}
@@ -1008,7 +1017,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());
int i = 0;
if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
String item;
@@ -1021,7 +1030,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();

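The CrawlProfile changes above persist the two new attributes as plain strings in the profile map: the default valency is stored under its enum name and the switch tag names as a JSON array. A small self-contained sketch of that round trip; the map, the class names and the class itself are illustrative stand-ins, not part of the commit:

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    import org.json.JSONArray;
    import org.json.JSONException;

    public class ValencyAttributeRoundTrip {
        public static void main(String[] args) throws JSONException {
            // write side, as in the new CrawlProfile constructor:
            // default_valency stores the enum name, valency_switch_tag_name stores a JSON array
            Set<String> switchTagNames = new HashSet<>();
            switchTagNames.add("navigation");   // hypothetical DIV class names
            switchTagNames.add("sidebar");
            Map<String, String> profile = new HashMap<>();
            profile.put("default_valency", "EVAL");
            profile.put("valency_switch_tag_name", new JSONArray(switchTagNames).toString());

            // read side, as in the map-based constructor: fall back to EVAL when the key is absent
            String v = profile.get("default_valency");
            String defaultValency = (v == null || v.isEmpty()) ? "EVAL" : v;
            Set<String> restored = new HashSet<>();
            JSONArray a = new JSONArray(profile.get("valency_switch_tag_name"));
            for (int i = 0; i < a.length(); i++) restored.add(a.getString(i));

            System.out.println(defaultValency + " " + restored);
        }
    }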
@@ -48,6 +48,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
+import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
public class Response {
@@ -853,7 +854,7 @@
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
// 7) local surrogates processing (can not be known here : crawl profile is required)
EventOrigin processCase = EventOrigin.UNKNOWN;
// FIXME the equals seems to be incorrect: String.equals(boolean)
if (initiator() == null || initiator().length == 0 || ASCII.String(initiator()).equals("------------")) {
@@ -873,9 +874,13 @@
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
-return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
+return TextParser.parseSource(
+url(), this.responseHeader == null ? null : this.responseHeader.getContentType(),
+this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(),
+TagValency.EVAL, new HashSet<String>(),
+new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
return null;
}

@@ -32,6 +32,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.parser.html.TagValency;
public abstract class AbstractParser implements Parser {
@@ -41,20 +42,20 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
/**
* initialize a parser with a name
* @param name
*/
public AbstractParser(final String name) {
this.name = name;
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
*/
@Override
public Document[] parse(
DigestURL url,
@@ -64,7 +65,7 @@ public abstract class AbstractParser implements Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
-return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
+return parse(url, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
@@ -72,15 +73,15 @@ public abstract class AbstractParser implements Parser {
DigestURL url,
String mimeType,
String charset,
-Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
@@ -88,32 +89,33 @@ public abstract class AbstractParser implements Parser {
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
-return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
+return parseWithLimits(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
public Document[] parseWithLimits(
DigestURL location,
String mimeType,
String charset,
-final Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes);
}
/**
* return the name of the parser
*/
@@ -164,12 +166,11 @@ public abstract class AbstractParser implements Parser {
if (t != null) c.add(t);
return c;
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
return false;
}
}

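AbstractParser keeps the old entry points working by delegating to the new valency-aware overloads with TagValency.EVAL and an empty switch set, so existing parsers compile unchanged; a parser that wants to honour the valency information overrides the extended overload. A structural sketch of such an override follows; the class name, the trivial return value and any import locations not shown in this diff are assumptions, and the real extraction logic is omitted:

    import java.io.InputStream;
    import java.util.HashSet;
    import java.util.Set;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.AbstractParser;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.VocabularyScraper;
    import net.yacy.document.parser.html.TagValency;

    public class TagAwareDemoParser extends AbstractParser {

        public TagAwareDemoParser() {
            super("TagAwareDemoParser");
        }

        // legacy entry point: delegate to the valency-aware variant with the EVAL default
        @Override
        public Document[] parse(DigestURL url, String mimeType, String charset,
                VocabularyScraper scraper, int timezoneOffset, InputStream source)
                throws Parser.Failure, InterruptedException {
            return parse(url, mimeType, charset, TagValency.EVAL, new HashSet<String>(),
                    scraper, timezoneOffset, source);
        }

        // valency-aware entry point: this is where matching class names would flip
        // the default valency while walking the document tree
        @Override
        public Document[] parse(DigestURL url, String mimeType, String charset,
                TagValency defaultValency, Set<String> valencySwitchTagNames,
                VocabularyScraper scraper, int timezoneOffset, InputStream source)
                throws Parser.Failure, InterruptedException {
            return new Document[0]; // real parsing omitted in this sketch
        }
    }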
@@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.document.parser.html.TagValency;
public interface Parser {
@@ -63,72 +64,87 @@
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
-Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, eventually terminating processing when a total of
* maxLinks URLS (anchors, images links, media links...) have been reached,
* or when maxBytes content bytes have been processed, thus potentially
* resulting in partially parsed documents (with
* {@link Document#isPartiallyParsed()} returning true). Some parser
* implementations will not support parsing within maxLinks or maxBytes
* limits : make sure to check this by calling fist
* {@link #isParseWithLimitsSupported()}, or a UnsupportedOperationException
* could be thrown.
*
* @param url
* the URL of the source
* @param mimeType
* the mime type of the source, if known
* @param charset
* the charset name of the source, if known
* @param scraper
* an entity scraper to detect facets from text annotation
* context
* @param timezoneOffset
* the local time zone offset
* @param source
* a input stream
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
* @throws InterruptedException
* when the processing was interrupted before termination
* @throws UnsupportedOperationException
* when the parser implementation doesn't support parsing within
* limits
*/
-public Document[] parseWithLimits(DigestURL url, String mimeType, String charset,
-VocabularyScraper scraper,
-int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
-throws Parser.Failure, InterruptedException, UnsupportedOperationException;
+public Document[] parseWithLimits(
+DigestURL url,
+String mimeType,
+String charset,
+VocabularyScraper scraper,
+int timezoneOffset,
+InputStream source,
+int maxLinks,
+long maxBytes)
+throws Parser.Failure, InterruptedException, UnsupportedOperationException;
-public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
-final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
-final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
-throws Parser.Failure, InterruptedException, UnsupportedOperationException;
+public Document[] parseWithLimits(
+final DigestURL location,
+final String mimeType,
+final String documentCharset,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
+final VocabularyScraper vocscraper,
+final int timezoneOffset,
+final InputStream sourceStream,
+final int maxLinks,
+final long maxBytes)
+throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.
*/
public boolean isParseWithLimitsSupported();
// methods to that shall make it possible to put Parser objects into a hashtable

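With the extended interface above, a caller picks a default valency plus an optional set of tag or class names that flip that default, and can still fall back to the unlimited parse() when limits are not supported. A hedged usage sketch; the helper method, its arguments and the limit values are illustrative and not part of the commit:

    import java.io.InputStream;
    import java.util.Set;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.VocabularyScraper;
    import net.yacy.document.parser.html.TagValency;

    public class ValencyParseExample {

        /**
         * Parse a stream, evaluating all tags by default but treating elements whose
         * class name is listed in ignoredClassNames with the switched valency.
         */
        public static Document[] parseIgnoringClasses(
                final Parser parser,
                final DigestURL location,
                final String mimeType,
                final InputStream source,
                final Set<String> ignoredClassNames) throws Exception {
            if (!parser.isParseWithLimitsSupported()) {
                // fall back to the unlimited variant when this parser cannot enforce limits
                return parser.parse(location, mimeType, "UTF-8",
                        TagValency.EVAL, ignoredClassNames,
                        new VocabularyScraper(), 0, source);
            }
            return parser.parseWithLimits(location, mimeType, "UTF-8",
                    TagValency.EVAL, ignoredClassNames,
                    new VocabularyScraper(), 0, source,
                    1000 /* maxLinks */, 1024L * 1024L /* maxBytes */);
        }
    }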
@@ -51,6 +51,7 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
+import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser;
@@ -184,7 +185,8 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
-final Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@@ -201,7 +203,7 @@
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
-docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
+docs = parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -218,7 +220,8 @@
final DigestURL location,
String mimeType,
final String charset,
-final Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@@ -236,7 +239,7 @@
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
-final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@@ -248,7 +251,8 @@
final DigestURL location,
String mimeType,
final String charset,
-final Set<String> ignoreClassNames,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@@ -261,14 +265,15 @@
final Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
-return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+return parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
-final Set<String> ignore_class_name,
+final TagValency defaultValency,
+final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@@ -330,7 +335,7 @@
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
-return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
+return parseSource(location, mimeType, parser, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (final Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@@ -378,11 +383,11 @@
int maxBytesToRead = -1;
if(maxBytes < Integer.MAX_VALUE) {
/* Load at most maxBytes + 1 :
- to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
maxBytesToRead = (int)maxBytes + 1;
}
-if(contentLength >= 0 && contentLength < maxBytesToRead) {
+if (contentLength >= 0 && contentLength < maxBytesToRead) {
maxBytesToRead = (int)contentLength;
}
@@ -392,16 +397,23 @@
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
-final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
+final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
-public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
-final Set<String> ignore_class_name,
+public static Document[] parseSource(
+final DigestURL location,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength, String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream) throws Parser.Failure { final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream, return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE); Integer.MAX_VALUE, Long.MAX_VALUE);
} }
@ -424,10 +436,19 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text. * @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed * @throws Parser.Failure when the parser processing failed
*/ */
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames, public static Document[] parseWithLimits(
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks, final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream,
int maxLinks,
long maxBytes) throws Parser.Failure{ long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength, return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes); sourceStream, maxLinks, maxBytes);
} }
@ -449,10 +470,11 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text. * @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed * @throws Parser.Failure when the parser processing failed
*/ */
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, public static Document[] parseWithLimits(
final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks, final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{ long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength, return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes); sourceStream, maxLinks, maxBytes);
} }
@ -475,7 +497,8 @@ public final class TextParser {
final String mimeType, final String mimeType,
final Parser parser, final Parser parser,
final String charset, final String charset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream sourceStream, final InputStream sourceStream,
@ -491,11 +514,11 @@ public final class TextParser {
try { try {
final Document[] docs; final Document[] docs;
if(parser.isParseWithLimitsSupported()) { if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes); docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else { } else {
/* Parser does not support partial parsing within limits : let's control it here */ /* Parser does not support partial parsing within limits : let's control it here */
final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes); final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource); docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, limitedSource);
} }
return docs; return docs;
} catch(final Parser.Failure e) { } catch(final Parser.Failure e) {
@ -524,7 +547,8 @@ public final class TextParser {
final String mimeType, final String mimeType,
final Set<Parser> parsers, final Set<Parser> parsers,
final String charset, final String charset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final int depth, final int depth,
@ -552,13 +576,13 @@ public final class TextParser {
} }
try { try {
if(parser.isParseWithLimitsSupported()) { if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes); docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else { } else {
/* Partial parsing is not supported by this parser : check content length now */ /* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) { if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location); throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
} }
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis); docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis);
} }
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException && if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

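For reference, a minimal caller-side sketch of the reworked TextParser entry point shown above. This is only a sketch under assumptions: the package of DigestURL and the local file path are assumed, and passing TagValency.EVAL together with an empty valencySwitchTagNames set is intended to reproduce the old behaviour of an empty ignore_class_name set.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;

public class TagValencyCallSketch {
    public static void main(final String[] args) throws Exception {
        // hypothetical resource; any URL/mime combination handled by TextParser works the same way
        final DigestURL location = new DigestURL("http://example.org/page.html");
        final Set<String> valencySwitchTagNames = new HashSet<>(); // empty: no tag switches its valency
        try (InputStream in = Files.newInputStream(Paths.get("page.html"))) { // hypothetical local copy
            final Document[] docs = TextParser.parseSource(
                    location, "text/html", "UTF-8",
                    TagValency.EVAL,          // default valency, matching the defaults introduced above
                    valencySwitchTagNames,
                    new VocabularyScraper(),
                    0,                        // timezoneOffset
                    1,                        // depth
                    -1,                       // contentLength unknown
                    in);
            System.out.println(docs.length + " document(s) parsed from " + location.toNormalform(true));
        }
    }
}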
@ -68,6 +68,7 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader; import net.yacy.document.content.SurrogateReader;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.NamePrefixThreadFactory; import net.yacy.kelondro.util.NamePrefixThreadFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -101,11 +102,11 @@ public class MediawikiImporter extends Thread implements Importer {
public MediawikiImporter(final MultiProtocolURL sourcefile, final File targetdir) { public MediawikiImporter(final MultiProtocolURL sourcefile, final File targetdir) {
super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")"); super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")");
this.sourcefile = sourcefile; this.sourcefile = sourcefile;
this.docsize = sourcefile.length(); this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L); this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir; this.targetdir = targetdir;
this.count = 0; this.count = 0;
this.start = 0; this.start = 0;
this.hostport = null; this.hostport = null;
@ -154,7 +155,7 @@ public class MediawikiImporter extends Thread implements Importer {
} }
@SuppressWarnings("resource") @SuppressWarnings("resource")
@Override @Override
public void run() { public void run() {
this.start = System.currentTimeMillis(); this.start = System.currentTimeMillis();
final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
@ -179,8 +180,8 @@ public class MediawikiImporter extends Thread implements Importer {
boolean page = false, text = false; boolean page = false, text = false;
String title = null; String title = null;
final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10); final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final ExecutorService service = Executors.newCachedThreadPool( final ExecutorService service = Executors.newCachedThreadPool(
new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer")); new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer"));
final convertConsumer[] consumers = new convertConsumer[threads]; final convertConsumer[] consumers = new convertConsumer[threads];
final Future<?>[] consumerResults = (Future<?>[]) Array.newInstance(Future.class, threads); final Future<?>[] consumerResults = (Future<?>[]) Array.newInstance(Future.class, threads);
for (int i = 0; i < threads; i++) { for (int i = 0; i < threads; i++) {
@ -276,23 +277,23 @@ public class MediawikiImporter extends Thread implements Importer {
consumerResults[i].get(10000, TimeUnit.MILLISECONDS); consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
} }
} catch (final Exception e) { } catch (final Exception e) {
this.errorMessage = e.getMessage(); this.errorMessage = e.getMessage();
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} finally { } finally {
out.put(poison); // output thread condition (for file.close) out.put(poison); // output thread condition (for file.close)
writerResult.get(10000, TimeUnit.MILLISECONDS); writerResult.get(10000, TimeUnit.MILLISECONDS);
} }
} catch (final Exception e) { } catch (final Exception e) {
this.errorMessage = e.getMessage(); this.errorMessage = e.getMessage();
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} finally { } finally {
if(reader != null) { if(reader != null) {
try { try {
reader.close(); reader.close();
} catch (IOException e) { } catch (IOException e) {
ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage()); ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage());
} }
} }
try { try {
out.put(poison); // out keeps the output file open until poisoned, to close the file if an exception happened in this block out.put(poison); // out keeps the output file open until poisoned, to close the file if an exception happened in this block
} catch (InterruptedException ex) { } } catch (InterruptedException ex) { }
@ -310,7 +311,7 @@ public class MediawikiImporter extends Thread implements Importer {
File mediawikixml; File mediawikixml;
public indexMaker(final File mediawikixml) { public indexMaker(final File mediawikixml) {
super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : ""); super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : "");
this.mediawikixml = mediawikixml; this.mediawikixml = mediawikixml;
} }
@ -337,8 +338,8 @@ public class MediawikiImporter extends Thread implements Importer {
final PositionAwareReader in = new PositionAwareReader(dumpFile); final PositionAwareReader in = new PositionAwareReader(dumpFile);
final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile)); final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
final wikiConsumer consumer = new wikiConsumer(100, producer); final wikiConsumer consumer = new wikiConsumer(100, producer);
final ExecutorService service = Executors.newCachedThreadPool( final ExecutorService service = Executors.newCachedThreadPool(
new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex")); new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex"));
final Future<Integer> producerResult = service.submit(consumer); final Future<Integer> producerResult = service.submit(consumer);
final Future<Integer> consumerResult = service.submit(producer); final Future<Integer> consumerResult = service.submit(producer);
service.shutdown(); service.shutdown();
@ -535,14 +536,14 @@ public class MediawikiImporter extends Thread implements Importer {
} }
public void genDocument() throws Parser.Failure { public void genDocument() throws Parser.Failure {
try { try {
this.url = new AnchorURL(this.urlStub + this.title); this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html)); final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed); this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here // the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title); this.document.setTitle(this.title);
} catch (final MalformedURLException e1) { } catch (final MalformedURLException e1) {
ConcurrentLog.logException(e1); ConcurrentLog.logException(e1);
} }
} }
public void writeXML(final OutputStreamWriter os) throws IOException { public void writeXML(final OutputStreamWriter os) throws IOException {
this.document.writeXML(os); this.document.writeXML(os);
@ -676,9 +677,9 @@ public class MediawikiImporter extends Thread implements Importer {
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} catch (final IOException e) { } catch (final IOException e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
@ -772,78 +773,78 @@ public class MediawikiImporter extends Thread implements Importer {
} }
public static void main(final String[] s) { public static void main(final String[] s) {
if (s.length == 0) { if (s.length == 0) {
System.out.println("usage:"); System.out.println("usage:");
System.out.println(" -index <wikipedia-dump>"); System.out.println(" -index <wikipedia-dump>");
System.out.println(" -read <start> <len> <idx-file>"); System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>"); System.out.println(" -find <title> <wikipedia-dump>");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
ConcurrentLog.shutdown(); ConcurrentLog.shutdown();
return; return;
} }
try { try {
// example: // example:
// java -Xmx2000m -cp classes:lib/bzip2.jar // java -Xmx2000m -cp classes:lib/bzip2.jar
// de.anomic.tools.mediawikiIndex -convert // de.anomic.tools.mediawikiIndex -convert
// DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 // DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2
// DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ // DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert")) { if (s[0].equals("-convert")) {
if(s.length < 3) { if(s.length < 3) {
System.out.println("usage:"); System.out.println("usage:");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
ConcurrentLog.shutdown(); ConcurrentLog.shutdown();
return; return;
} }
final File targetdir = new File(s[2]); final File targetdir = new File(s[2]);
try { try {
final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir); final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir);
mi.start(); mi.start();
mi.join(); mi.join();
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
if (s[0].equals("-index")) { if (s[0].equals("-index")) {
try { try {
createIndex(new File(s[1])); createIndex(new File(s[1]));
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
if (s[0].equals("-read")) { if (s[0].equals("-read")) {
final long start = Integer.parseInt(s[1]); final long start = Integer.parseInt(s[1]);
final int len = Integer.parseInt(s[2]); final int len = Integer.parseInt(s[2]);
System.out.println(UTF8.String(read(new File(s[3]), start, len))); System.out.println(UTF8.String(read(new File(s[3]), start, len)));
} }
if (s[0].equals("-find")) { if (s[0].equals("-find")) {
try { try {
final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml")); final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
if (w == null) { if (w == null) {
ConcurrentLog.info("WIKITRANSLATION", "not found"); ConcurrentLog.info("WIKITRANSLATION", "not found");
} else { } else {
System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start)))); System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start))));
} }
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
} finally { } finally {
try { try {
HTTPClient.closeConnectionManager(); HTTPClient.closeConnectionManager();
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
} }
ConcurrentLog.shutdown(); ConcurrentLog.shutdown();
} }
} }
} }

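As a usage note, the importer can also be driven programmatically, mirroring the -convert branch of main() above. The dump path and target directory below are placeholders taken from the usage text, and the package of MediawikiImporter is assumed.

import java.io.File;

import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.importer.MediawikiImporter;

public class MediawikiConvertSketch {
    public static void main(final String[] args) throws Exception {
        // placeholder paths, following the example given in the usage text of main()
        final MultiProtocolURL dump = new MultiProtocolURL("DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2");
        final File targetdir = new File("DATA/SURROGATES/in");
        final MediawikiImporter mi = new MediawikiImporter(dump, targetdir);
        mi.start();
        mi.join(); // wait for the reader, converter and writer threads to finish
    }
}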
@ -37,6 +37,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
/** /**
* Base class for parsing compressed files relying on Apache commons-compress * Base class for parsing compressed files relying on Apache commons-compress
@ -44,25 +45,25 @@ import net.yacy.document.VocabularyScraper;
*/ */
public abstract class AbstractCompressorParser extends AbstractParser implements Parser { public abstract class AbstractCompressorParser extends AbstractParser implements Parser {
/** Crawl depth applied when parsing internal compressed content */ /** Crawl depth applied when parsing internal compressed content */
protected static final int DEFAULT_DEPTH = 999; protected static final int DEFAULT_DEPTH = 999;
/** /**
* @param name the human readable name of the parser * @param name the human readable name of the parser
*/ */
public AbstractCompressorParser(final String name) { public AbstractCompressorParser(final String name) {
super(name); super(name);
} }
/** /**
* @param source an open input stream on a compressed source * @param source an open input stream on a compressed source
* @return a sub class of CompressorInputStream capable of uncompressing the source * @return a sub class of CompressorInputStream capable of uncompressing the source
* on the fly * on the fly
* @throws IOException when an error occurs while trying to open the compressed * @throws IOException when an error occurs while trying to open the compressed
* stream * stream
*/ */
protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException; protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;
/** /**
* Maps the given name of a compressed file to the name that the * Maps the given name of a compressed file to the name that the
* file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned. * file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned.
@ -72,116 +73,137 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
*/ */
protected abstract String getUncompressedFilename(final String filename); protected abstract String getUncompressedFilename(final String filename);
@Override @Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, public Document[] parse(
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset, final DigestURL location,
final InputStream source) throws Parser.Failure, InterruptedException { final String mimeType,
final String charset,
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE, final TagValency defaultValency,
Long.MAX_VALUE); final Set<String> valencySwitchTagNames,
} final VocabularyScraper scraper,
final int timezoneOffset,
@Override final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset, return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure { Long.MAX_VALUE);
Document maindoc; }
final CompressorInputStream compressedInStream;
try { @Override
compressedInStream = createDecompressStream(source); public Document[] parseWithLimits(
} catch (final IOException | RuntimeException e) { final DigestURL location,
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location); final String mimeType,
} final String charset,
final TagValency defaultValency,
try { final Set<String> valencySwitchTagNames,
// create maindoc for this archive, register with supplied url & mime final VocabularyScraper scraper,
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this); final int timezoneOffset,
final InputStream source,
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset, final int maxLinks,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes); final long maxBytes) throws Parser.Failure {
if (docs != null) { Document maindoc;
maindoc.addSubDocuments(docs); final CompressorInputStream compressedInStream;
if (docs.length > 0 && docs[0].isPartiallyParsed()) { try {
maindoc.setPartiallyParsed(true); compressedInStream = createDecompressStream(source);
} } catch (final IOException | RuntimeException e) {
} throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
} catch (final Parser.Failure e) { }
throw e;
} catch (final IOException | RuntimeException e) { try {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location); // create maindoc for this archive, register with supplied url & mime
} maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
return new Document[] { maindoc };
} final Document[] docs = this.parseCompressedInputStream(location, null, defaultValency, valencySwitchTagNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
/** if (docs != null) {
* Create the main parsed document for the compressed document at the given URL maindoc.addSubDocuments(docs);
* and Media type if (docs.length > 0 && docs[0].isPartiallyParsed()) {
* maindoc.setPartiallyParsed(true);
* @param location the parsed resource URL }
* @param mimeType the media type of the resource }
* @param charset the charset name if known } catch (final Parser.Failure e) {
* @param parser an instance of CompressorParser that is registered as the throw e;
* parser origin of the document } catch (final IOException | RuntimeException e) {
* @return a Document instance throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
*/ }
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, return new Document[] { maindoc };
final AbstractCompressorParser parser) { }
final String filename = location.getFileName();
return new Document(location, mimeType, charset, parser, null, null, /**
AbstractParser * Create the main parsed document for the compressed document at the given URL
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title * and Media type
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); *
} * @param location the parsed resource URL
* @param mimeType the media type of the resource
/** * @param charset the charset name if known
* Parse content in an open stream uncompressing on the fly a compressed * @param parser an instance of CompressorParser that is registered as the
* resource. * parser origin of the document
* * @return a Document instance
* @param location the URL of the compressed resource */
* @param charset the charset name if known protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
* @param ignoreClassNames an eventual set of CSS class names whose matching final AbstractCompressorParser parser) {
* html elements content should be ignored final String filename = location.getFileName();
* @param timezoneOffset the local time zone offset return new Document(location, mimeType, charset, parser, null, null,
* @param compressedInStream an open stream uncompressing on the fly the AbstractParser
* compressed content .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
* @param maxLinks the maximum total number of links to parse and add null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
* to the result documents }
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty /**
* or null text. * Parse content in an open stream uncompressing on the fly a compressed
* @throws Parser.Failure when the parser processing failed * resource.
*/ *
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset, * @param location the URL of the compressed resource
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth, * @param charset the charset name if known
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { * @param ignoreClassNames an eventual set of CSS class names whose matching
final String compressedFileName = location.getFileName(); * html elements content should be ignored
final String contentfilename = getUncompressedFilename(compressedFileName); * @param timezoneOffset the local time zone offset
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); * @param compressedInStream an open stream uncompressing on the fly the
try { * compressed content
/* * @param maxLinks the maximum total number of links to parse and add
* Use the uncompressed file name for sub parsers to not unnecessarily use again * to the result documents
* this same uncompressing parser * @param maxBytes the maximum number of content bytes to process
*/ * @return a list of documents that result from parsing the source, with empty
final String locationPath = location.getPath(); * or null text.
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) * @throws Parser.Failure when the parser processing failed
+ contentfilename; */
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), protected Document[] parseCompressedInputStream(
location.getPort(), contentPath); final DigestURL location,
final String charset,
/* final TagValency defaultValency,
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on final Set<String> valencySwitchTagNames,
* compressed content final int timezoneOffset, final int depth,
*/ final CompressorInputStream compressedInStream,
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth, final int maxLinks,
-1, compressedInStream, maxLinks, maxBytes); final long maxBytes) throws Failure {
} catch (final MalformedURLException e) { final String compressedFileName = location.getFileName();
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location); final String contentfilename = getUncompressedFilename(compressedFileName);
} final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
} try {
/*
@Override * Use the uncompressed file name for sub parsers to not unnecessarily use again
public boolean isParseWithLimitsSupported() { * this same uncompressing parser
return true; */
} final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(
contentLocation, mime, charset, defaultValency, valencySwitchTagNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
} }

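To illustrate the contract of this base class, a minimal hypothetical subclass could look like the following. It assumes commons-compress xz support on the classpath and the usual net.yacy.document.parser package for AbstractCompressorParser; it is a sketch, not YaCy's actual xz parser.

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;

import net.yacy.document.parser.AbstractCompressorParser;

public class XzParserSketch extends AbstractCompressorParser {

    public XzParserSketch() {
        super("XZ Compressed Archive Parser (sketch)");
        this.SUPPORTED_EXTENSIONS.add("xz"); // registration pattern as in the other compressor parsers
    }

    @Override
    protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
        // decompress on the fly; parse()/parseWithLimits() in the base class do the rest
        return new XZCompressorInputStream(source);
    }

    @Override
    protected String getUncompressedFilename(final String filename) {
        // "file.txt.xz" -> "file.txt", so sub parsers are selected by the inner file's extension
        return XZUtils.getUncompressedFilename(filename);
    }
}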
@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
/** /**
@ -52,7 +53,7 @@ import net.yacy.kelondro.util.FileUtils;
* Unzips and parses the content and adds it to the created main document * Unzips and parses the content and adds it to the created main document
*/ */
public class bzipParser extends AbstractParser implements Parser { public class bzipParser extends AbstractParser implements Parser {
public bzipParser() { public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser"); super("Bzip 2 UNIX Compressed File Parser");
this.SUPPORTED_EXTENSIONS.add("bz2"); this.SUPPORTED_EXTENSIONS.add("bz2");
@ -70,7 +71,8 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream source) final InputStream source)
@ -99,25 +101,25 @@ public class bzipParser extends AbstractParser implements Parser {
out = null; out = null;
} catch(Exception e) { } catch(Exception e) {
if (tempFile != null) { if (tempFile != null) {
FileUtils.deletedelete(tempFile); FileUtils.deletedelete(tempFile);
} }
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
} finally { } finally {
if(zippedContent != null) { if(zippedContent != null) {
try { try {
zippedContent.close(); zippedContent.close();
} catch (IOException ignored) { } catch (IOException ignored) {
log.warn("Could not close bzip input stream"); log.warn("Could not close bzip input stream");
} }
} }
if(out != null) { if(out != null) {
try { try {
out.close(); out.close();
} catch (IOException e) { } catch (IOException e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
} }
} }
} }
try { try {
// create maindoc for this bzip container, register with supplied url & mime // create maindoc for this bzip container, register with supplied url & mime
@ -125,7 +127,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile); final Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs); if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -140,7 +142,7 @@ public class bzipParser extends AbstractParser implements Parser {
@Override @Override
public boolean isParseWithLimitsSupported() { public boolean isParseWithLimitsSupported() {
return true; return true;
} }
/** /**
@ -151,9 +153,9 @@ public class bzipParser extends AbstractParser implements Parser {
* @param parser instance of bzipParser that is registered as the parser origin of the document * @param parser instance of bzipParser that is registered as the parser origin of the document
* @return a Document instance * @return a Document instance
*/ */
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) { public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) {
final String filename = location.getFileName(); final String filename = location.getFileName();
Document maindoc = new Document( Document maindoc = new Document(
location, location,
mimeType, mimeType,
charset, charset,
@ -172,49 +174,48 @@ public class bzipParser extends AbstractParser implements Parser {
null, null,
false, false,
new Date()); new Date());
return maindoc; return maindoc;
} }
/** /**
* Parse content in an open stream uncompressing on the fly a bzipped resource. * Parse content in an open stream uncompressing on the fly a bzipped resource.
* @param location the URL of the bzipped resource * @param location the URL of the bzipped resource
* @param charset the charset name if known * @param charset the charset name if known
* @param timezoneOffset the local time zone offset * @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the compressed content * @param compressedInStream an open stream uncompressing on the fly the compressed content
* @param maxLinks * @param maxLinks
* the maximum total number of links to parse and add to the * the maximum total number of links to parse and add to the
* result documents * result documents
* @param maxBytes * @param maxBytes
* the maximum number of content bytes to process * the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with * @return a list of documents that result from parsing the source, with
* empty or null text. * empty or null text.
* @throws Parser.Failure * @throws Parser.Failure
* when the parser processing failed * when the parser processing failed
*/ */
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth, public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName(); final String compressedFileName = location.getFileName();
final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName); final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try { try {
/* Use the uncompressed file name for sub parsers so they do not unnecessarily run this bzip parser again */ /* Use the uncompressed file name for sub parsers so they do not unnecessarily run this bzip parser again */
final String locationPath = location.getPath(); final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename; final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath); final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes); return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
} }
} }
@Override @Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Parser.Failure { throws Parser.Failure {
Document maindoc = null; Document maindoc = null;
BZip2CompressorInputStream zippedContent = null; BZip2CompressorInputStream zippedContent = null;
try { try {
@ -222,23 +223,23 @@ public class bzipParser extends AbstractParser implements Parser {
zippedContent = new BZip2CompressorInputStream(source); zippedContent = new BZip2CompressorInputStream(source);
} catch(Exception e) { } catch(Exception e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
} }
try { try {
// create maindoc for this bzip container, register with supplied url & mime // create maindoc for this bzip container, register with supplied url & mime
maindoc = createMainDocument(location, mimeType, charset, this); maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final Document[] docs = parseCompressedInputStream(location, null, timezoneOffset, 999, zippedContent, maxLinks, maxBytes); final Document[] docs = parseCompressedInputStream(location, null, timezoneOffset, 999, zippedContent, maxLinks, maxBytes);
if (docs != null) { if (docs != null) {
maindoc.addSubDocuments(docs); maindoc.addSubDocuments(docs);
if(docs.length > 0 && docs[0].isPartiallyParsed()) { if(docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true); maindoc.setPartiallyParsed(true);
} }
} }
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof Parser.Failure) { if (e instanceof Parser.Failure) {
throw (Parser.Failure) e; throw (Parser.Failure) e;
} }
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location); throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);

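A brief caller-side sketch of the limit-aware path for compressed content: the compatibility overload of TextParser.parseWithLimits without valency parameters (see the TextParser change above) falls back to TagValency.EVAL and an empty tag-name set. The URL, the local file path, and the media type string are placeholders.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;

public class BzipParseWithLimitsSketch {
    public static void main(final String[] args) throws Exception {
        final DigestURL location = new DigestURL("http://example.org/page.html.bz2"); // hypothetical
        try (InputStream in = Files.newInputStream(Paths.get("page.html.bz2"))) {     // hypothetical local copy
            final Document[] docs = TextParser.parseWithLimits(
                    location, "application/x-bzip2", null, // assumed media type for bz2 content
                    0,               // timezoneOffset
                    0,               // depth
                    -1,              // contentLength unknown
                    in,
                    1000,            // maxLinks
                    1024L * 1024L);  // maxBytes of uncompressed content to process
            final boolean partial = docs.length > 0 && docs[0].isPartiallyParsed();
            System.out.println("parsed " + docs.length + " document(s), partially parsed: " + partial);
        }
    }
}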
@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
/** /**
@ -52,8 +53,8 @@ import net.yacy.kelondro.util.FileUtils;
* Unzips and parses the content and adds it to the created main document * Unzips and parses the content and adds it to the created main document
*/ */
public class gzipParser extends AbstractParser implements Parser { public class gzipParser extends AbstractParser implements Parser {
private static final int DEFAULT_DEPTH = 999; private static final int DEFAULT_DEPTH = 999;
public gzipParser() { public gzipParser() {
super("GNU Zip Compressed Archive Parser"); super("GNU Zip Compressed Archive Parser");
@ -72,7 +73,8 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException { final InputStream source) throws Parser.Failure, InterruptedException {
@ -84,10 +86,10 @@ public class gzipParser extends AbstractParser implements Parser {
try { try {
zippedContent = new GZIPInputStream(source); zippedContent = new GZIPInputStream(source);
} catch(IOException e) { } catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening /* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening
* so that special error handling can be applied if needed */ * so that special error handling can be applied if needed */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
new GZIPOpeningStreamException()); new GZIPOpeningStreamException());
} }
try { try {
int read = 0; int read = 0;
@ -103,32 +105,32 @@ public class gzipParser extends AbstractParser implements Parser {
out.write(data, 0, read); out.write(data, 0, read);
} }
} catch(Exception e) { } catch(Exception e) {
if (tempFile != null) { if (tempFile != null) {
FileUtils.deletedelete(tempFile); FileUtils.deletedelete(tempFile);
} }
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
} finally { } finally {
if(zippedContent != null) { if(zippedContent != null) {
try { try {
zippedContent.close(); zippedContent.close();
} catch (IOException ignored) { } catch (IOException ignored) {
log.warn("Could not close gzip input stream"); log.warn("Could not close gzip input stream");
} }
} }
if(out != null) { if(out != null) {
try { try {
out.close(); out.close();
} catch (IOException e) { } catch (IOException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
} }
} }
} }
try { try {
maindoc = createMainDocument(location, mimeType, charset, this); maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName()); final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile); Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs); if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -149,96 +151,96 @@ public class gzipParser extends AbstractParser implements Parser {
* @param parser an instance of gzipParser that is registered as the parser origin of the document * @param parser an instance of gzipParser that is registered as the parser origin of the document
* @return a Document instance * @return a Document instance
*/ */
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) { public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
final String filename = location.getFileName(); final String filename = location.getFileName();
Document maindoc = new Document( Document maindoc = new Document(
location, location,
mimeType, mimeType,
charset, charset,
parser, parser,
null, null,
null, null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null,
null, null,
null, null,
null, null,
0.0d, 0.0d, 0.0d, 0.0d,
(Object) null, (Object) null,
null, null,
null, null,
null, null,
false, false,
new Date()); new Date());
return maindoc; return maindoc;
} }
/** /**
* Parse content in an open stream uncompressing on the fly a gzipped resource. * Parse content in an open stream uncompressing on the fly a gzipped resource.
* @param location the URL of the gzipped resource * @param location the URL of the gzipped resource
* @param charset the charset name if known * @param charset the charset name if known
* @param timezoneOffset the local time zone offset * @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the compressed content * @param compressedInStream an open stream uncompressing on the fly the compressed content
* @param maxLinks * @param maxLinks
* the maximum total number of links to parse and add to the * the maximum total number of links to parse and add to the
* result documents * result documents
* @param maxBytes * @param maxBytes
* the maximum number of content bytes to process * the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with * @return a list of documents that result from parsing the source, with
* empty or null text. * empty or null text.
* @throws Parser.Failure * @throws Parser.Failure
* when the parser processing failed * when the parser processing failed
*/ */
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth, public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure { final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName(); final String compressedFileName = location.getFileName();
final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName); final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try { try {
/* Use the uncompressed file name for sub parsers so they do not unnecessarily run the gzip parser again */ /* Use the uncompressed file name for sub parsers so they do not unnecessarily run the gzip parser again */
final String locationPath = location.getPath(); final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename; final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath); final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes); return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
} }
} }
@Override @Override
public boolean isParseWithLimitsSupported() { public boolean isParseWithLimitsSupported() {
return true; return true;
} }
@Override @Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Parser.Failure { throws Parser.Failure {
Document maindoc = null; Document maindoc = null;
GZIPInputStream zippedContent = null; GZIPInputStream zippedContent = null;
try { try {
/* Only use in-memory stream here (no temporary file) : the parsers /* Only use in-memory stream here (no temporary file) : the parsers
* matching compressed content are expected to handle properly the maxBytes limit and terminate * matching compressed content are expected to handle properly the maxBytes limit and terminate
* before an eventual OutOfMemory occurs */ * before an eventual OutOfMemory occurs */
zippedContent = new GZIPInputStream(source); zippedContent = new GZIPInputStream(source);
} catch(IOException e) { } catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening /* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
* and eventually apply special error handling */ * and eventually apply special error handling */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location, throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
new GZIPOpeningStreamException()); new GZIPOpeningStreamException());
} }
try { try {
maindoc = createMainDocument(location, mimeType, charset, this); maindoc = createMainDocument(location, mimeType, charset, this);
Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes); Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
if (docs != null) { if (docs != null) {
maindoc.addSubDocuments(docs); maindoc.addSubDocuments(docs);
if(docs.length > 0 && docs[0].isPartiallyParsed()) { if(docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true); maindoc.setPartiallyParsed(true);
} }
} }
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location); throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
@ -251,15 +253,15 @@ public class gzipParser extends AbstractParser implements Parser {
*/ */
public class GZIPOpeningStreamException extends Exception { public class GZIPOpeningStreamException extends Exception {
/** The serialization ID */ /** The serialization ID */
private static final long serialVersionUID = 2824038185373304636L; private static final long serialVersionUID = 2824038185373304636L;
public GZIPOpeningStreamException() { public GZIPOpeningStreamException() {
super(); super();
} }
public GZIPOpeningStreamException(final String message) { public GZIPOpeningStreamException(final String message) {
super(message); super(message);
} }
} }
} }
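GZIPOpeningStreamException exists so a caller can tell a stream that is not valid gzip at all apart from a failure that happens later during parsing; the distinction works because java.util.zip.GZIPInputStream already validates the gzip header in its constructor. A small, self-contained illustration of that fail-fast behaviour (class and method names are mine, not YaCy's):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

public class GzipOpenProbe {
    /** Opening the stream reads and checks the gzip header, so non-gzip input fails here and nowhere else. */
    static InputStream openGzip(InputStream source) throws IOException {
        return new GZIPInputStream(source);
    }

    public static void main(String[] args) {
        try {
            openGzip(new ByteArrayInputStream(new byte[] { 0x00, 0x01, 0x02 }));
        } catch (IOException e) {
            // this is the situation the parser maps to a Parser.Failure carrying a GZIPOpeningStreamException
            System.out.println("not a gzip stream: " + e.getMessage());
        }
    }
}
```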
@ -63,7 +63,7 @@ import net.yacy.document.parser.html.TransformerWriter;
public class htmlParser extends AbstractParser implements Parser { public class htmlParser extends AbstractParser implements Parser {
/** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */ /** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
private static final int DEFAULT_MAX_LINKS = 10000; private static final int DEFAULT_MAX_LINKS = 10000;
public htmlParser() { public htmlParser() {
@ -108,42 +108,93 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset, final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE); return parseWithLimits(
location,
mimeType,
documentCharset,
TagValency.EVAL,
new HashSet<String>(),
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
} }
@Override @Override
public Document[] parse( public Document[] parse(
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String documentCharset, final String documentCharset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper, final VocabularyScraper vocscraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE); return parseWithLimits(
location, mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
} }
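The recurring signature change in this commit replaces the old ignore_class_name set with a defaultValency plus a set of names that switch that valency; the same pair is threaded through htmlParser, the archive parsers below and the crawl profile. The stand-in enum below only illustrates the presumed semantics (YaCy's actual TagValency members beyond EVAL are assumed, not taken from this diff): with EVAL as the default the switch set behaves like the old ignore list, while with an ignoring default the same set would name the only parts to evaluate.

```java
import java.util.Set;

public class TagValencySketch {
    /** Illustrative stand-in for net.yacy.document.parser.html.TagValency; only EVAL appears in this diff, IGNORE is assumed. */
    enum TagValency { EVAL, IGNORE }

    /** An element follows the default valency unless its name is in the switch set, which flips the behaviour. */
    static boolean evaluate(TagValency defaultValency, Set<String> valencySwitchTagNames, String name) {
        final boolean switched = valencySwitchTagNames != null && valencySwitchTagNames.contains(name);
        return (defaultValency == TagValency.EVAL) != switched;
    }

    public static void main(String[] args) {
        System.out.println(evaluate(TagValency.EVAL, Set.of("menu"), "menu"));     // false: skipped, like the old ignore list
        System.out.println(evaluate(TagValency.EVAL, Set.of("menu"), "article"));  // true: indexed as before
    }
}
```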
@Override @Override
public boolean isParseWithLimitsSupported() { public boolean isParseWithLimitsSupported() {
return true; return true;
} }
@Override @Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, public Document[] parseWithLimits(
final Set<String> ignore_class_name, final VocabularyScraper vocscraper, final DigestURL location,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes) final String mimeType,
throws Failure { final String documentCharset,
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes); final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Failure {
return parseWithLimits(
location,
mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
maxLinks,
maxLinks,
maxBytes);
} }
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper, private Document[] parseWithLimits(
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes) final DigestURL location,
throws Failure { final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxAnchors,
final int maxLinks,
final long maxBytes)
throws Failure {
try { try {
// first get a document from the parsed html // first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes); ContentScraper scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag // parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null; Document documentSnapshot = null;
@ -152,10 +203,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader) // and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes anchor part from original url getRef() is never successful - consider other handling as removeRef() in crawler // TODO: as a crawl request removes anchor part from original url getRef() is never successful - consider other handling as removeRef() in crawler
if (location.getRef() != null && location.getRef().startsWith("!")) { if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes); documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both) } else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) { if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes); documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} }
} }
} catch (Exception ex1) { // ignore any exception for any issue with snapshot } catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -221,7 +272,16 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd; return ppd;
} }
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException { public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final int timezoneOffset,
final String input,
final int maxAnchors,
final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream; InputStream sourceStream;
try { try {
@ -231,7 +291,7 @@ public class htmlParser extends AbstractParser implements Parser {
} }
ContentScraper scraper; // for this static method no need to init local this.scraperObject ContentScraper scraper; // for this static method no need to init local this.scraperObject
try { try {
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE); scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) { } catch (Failure e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} }
@ -256,7 +316,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper( public static ContentScraper parseToScraper(
final DigestURL location, final DigestURL location,
final String documentCharset, final String documentCharset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper, final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer, final Charset[] detectedcharsetcontainer,
final int timezoneOffset, final int timezoneOffset,
@ -264,7 +325,7 @@ public class htmlParser extends AbstractParser implements Parser {
final int maxAnchors, final int maxAnchors,
final int maxLinks, final int maxLinks,
final long maxBytes) throws Parser.Failure, IOException { final long maxBytes) throws Parser.Failure, IOException {
// make a scraper // make a scraper
String charset = null; String charset = null;
@ -280,8 +341,8 @@ public class htmlParser extends AbstractParser implements Parser {
htmlFilter = new ScraperInputStream( htmlFilter = new ScraperInputStream(
sourceStream, sourceStream,
documentCharset, documentCharset,
ignore_class_name, valencySwitchTagNames,
TagValency.EVAL, defaultValency,
vocabularyScraper, vocabularyScraper,
location, location,
false, false,
@ -325,26 +386,26 @@ public class htmlParser extends AbstractParser implements Parser {
location, location,
maxAnchors, maxAnchors,
maxLinks, maxLinks,
ignore_class_name, valencySwitchTagNames,
TagValency.EVAL, TagValency.EVAL,
vocabularyScraper, vocabularyScraper,
timezoneOffset); timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available()))); final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try { try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte()); final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]); final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars); final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
if(copiedChars > maxChars) { if(copiedChars > maxChars) {
/* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */ /* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true); scraper.setContentSizeLimitExceeded(true);
} else if(copiedChars == maxChars) { } else if(copiedChars == maxChars) {
/* Exactly maxChars limit reached : let's check if more to read remain. */ /* Exactly maxChars limit reached : let's check if more to read remain. */
if(sourceReader.read() >= 0) { if(sourceReader.read() >= 0) {
scraper.setContentSizeLimitExceeded(true); scraper.setContentSizeLimitExceeded(true);
} }
} }
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location); throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally { } finally {
writer.flush(); writer.flush();
//sourceStream.close(); keep open for multiple parsing (close done by caller) //sourceStream.close(); keep open for multiple parsing (close done by caller)
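The copy loop above converts the byte limit into a character budget with the decoder's average chars-per-byte ratio and then treats "exactly at the budget but more input remains" as exceeding the limit. A compact, runnable sketch of the same arithmetic with commons-io (charset and sample data chosen arbitrarily):

```java
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;

public class CharBudgetSketch {
    public static void main(String[] args) throws Exception {
        final long maxBytes = 16;
        // translate the byte budget into characters, as parseToScraper does with the detected charset
        final long maxChars = (long) (maxBytes * StandardCharsets.UTF_8.newDecoder().averageCharsPerByte());
        final Reader source = new StringReader("0123456789ABCDEFGHIJ"); // 20 chars, more than the budget
        final StringWriter sink = new StringWriter();
        final long copied = IOUtils.copyLarge(source, sink, 0, maxChars);
        // copyLarge never copies more than maxChars, so peek one more char to detect truncation
        final boolean limitExceeded = copied == maxChars && source.read() >= 0;
        System.out.println(copied + " chars copied, content size limit exceeded: " + limitExceeded);
    }
}
```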
@ -456,9 +517,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot * @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
*/ */
private Document parseAlternativeSnapshot( private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset, final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper, final TagValency defaultValency, final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) { final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null; Document documentSnapshot = null;
try { try {
// construct url for case (1) with anchor // construct url for case (1) with anchor
@ -476,17 +538,17 @@ public class htmlParser extends AbstractParser implements Parser {
Charset[] detectedcharsetcontainer = new Charset[]{null}; Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream snapshotStream = null; InputStream snapshotStream = null;
try { try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes); ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot); documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally { } finally {
if(snapshotStream != null) { if(snapshotStream != null) {
try { try {
snapshotStream.close(); snapshotStream.close();
} catch(IOException e) { } catch(IOException e) {
AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage()); AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage());
} }
} }
} }
AbstractParser.log.info("parse snapshot "+locationSnapshot.toString() + " additional to " + location.toString()); AbstractParser.log.info("parse snapshot "+locationSnapshot.toString() + " additional to " + location.toString());
} catch (IOException | Failure ex) { } } catch (IOException | Failure ex) { }
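The snapshot branch handles pages published under the retired AJAX crawling scheme, where a "#!state" fragment in the URL or a meta tag with name "fragment" and content "!" advertises an HTML snapshot reachable through an "_escaped_fragment_" query parameter. The exact URL construction sits in the elided lines of this hunk; the sketch below only restates the scheme's convention and is not copied from YaCy:

```java
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

public class EscapedFragmentSketch {
    /**
     * AJAX crawling scheme convention:
     *   http://host/page#!state                           -> http://host/page?_escaped_fragment_=state
     *   http://host/page with meta fragment content "!"   -> http://host/page?_escaped_fragment_=
     */
    static String snapshotUrl(String urlWithoutRef, String hashbangRef) {
        final String state = (hashbangRef == null || hashbangRef.isEmpty()) ? "" : hashbangRef.substring(1);
        final String separator = urlWithoutRef.contains("?") ? "&" : "?";
        return urlWithoutRef + separator + "_escaped_fragment_=" + URLEncoder.encode(state, StandardCharsets.UTF_8);
    }

    public static void main(String[] args) {
        System.out.println(snapshotUrl("http://host/page", "!key=value"));
        System.out.println(snapshotUrl("http://host/page", null));
    }
}
```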
@ -44,6 +44,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback; import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream; import SevenZip.IInStream;
@ -63,7 +64,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException { final IInStream source) throws Parser.Failure, InterruptedException {
@ -94,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location); throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
} }
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset); final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset);
AbstractParser.log.fine("processing archive contents..."); AbstractParser.log.fine("processing archive contents...");
try { try {
archive.Extract(null, -1, 0, aec); archive.Extract(null, -1, 0, aec);
@ -116,10 +118,11 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException { final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source)); return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, new ByteArrayIInStream(source));
} }
@Override @Override
@ -127,14 +130,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException { final InputStream source) throws Parser.Failure, InterruptedException {
try { try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos); FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())}; return new Document[]{parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location); throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
} }
@ -148,7 +152,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null; private ByteArrayOutputStream cfos = null;
private final Document doc; private final Document doc;
private final String prefix; private final String prefix;
private Set<String> ignore_class_name; private final TagValency defaultValency;
private Set<String> valencySwitchTagNames;
private final int timezoneOffset; private final int timezoneOffset;
public SZParserExtractCallback( public SZParserExtractCallback(
@ -156,13 +161,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler, final IInArchive handler,
final Document doc, final Document doc,
final String prefix, final String prefix,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset) { final int timezoneOffset) {
super.Init(handler); super.Init(handler);
this.log = logger; this.log = logger;
this.doc = doc; this.doc = doc;
this.prefix = prefix; this.prefix = prefix;
this.ignore_class_name = ignore_class_name; this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames;
this.timezoneOffset = timezoneOffset; this.timezoneOffset = timezoneOffset;
} }
@ -205,7 +212,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects // below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath); final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray()); theDocs = TextParser.parseSource(url, mime, null,this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs); this.doc.addSubDocuments(theDocs);
} }
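The 7z callback follows the same recursion pattern as the tar and zip parsers below: buffer one entry, derive a media type from its file extension, hand it back to TextParser together with the inherited valency settings and an incremented depth, and merge the results into the container document. A tiny stand-alone sketch of the extension-to-media-type step (the table is a toy substitute for TextParser.mimeOf):

```java
import java.util.List;
import java.util.Map;

public class ArchiveEntryMimeSketch {
    // toy lookup standing in for TextParser.mimeOf(...); the real mapping is much larger
    static final Map<String, String> MIME = Map.of("html", "text/html", "txt", "text/plain", "pdf", "application/pdf");

    /** Guess a media type for an archive entry from the extension of its path, before recursing into sub parsers. */
    static String mimeOfEntry(String entryPath) {
        final int idx = entryPath.lastIndexOf('.');
        final String ext = idx > -1 ? entryPath.substring(idx + 1).toLowerCase() : "";
        return MIME.getOrDefault(ext, "application/octet-stream");
    }

    public static void main(String[] args) {
        for (String name : List.of("docs/readme.txt", "site/index.html", "binary.blob")) {
            System.out.println(name + " -> " + mimeOfEntry(name));
        }
    }
}
```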
@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
// this is a new implementation of this parser idiom using multiple documents as result set // this is a new implementation of this parser idiom using multiple documents as result set
@ -70,7 +71,8 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException { InputStream source) throws Parser.Failure, InterruptedException {
@ -104,17 +106,17 @@ public class tarParser extends AbstractParser implements Parser {
try { try {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize()); FileUtils.copy(tis, tmp, entry.getSize());
/* /*
* Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive. * Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive.
* We use the tar file name as the parent sub path. Example : http://host/archive.tar/name. * We use the tar file name as the parent sub path. Example : http://host/archive.tar/name.
* Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the * Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
* extension of the URL is still ".tar", thus incorrectly making the tar parser * extension of the URL is still ".tar", thus incorrectly making the tar parser
* as a possible parser for the sub resource. * as a possible parser for the sub resource.
*/ */
final DigestURL subLocation = new DigestURL(parentTarURL, name); final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp); final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,999, tmp);
if (subDocs == null) { if (subDocs == null) {
continue; continue;
} }
maindoc.addSubDocuments(subDocs); maindoc.addSubDocuments(subDocs);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
@ -130,146 +132,146 @@ public class tarParser extends AbstractParser implements Parser {
return new Document[]{maindoc}; return new Document[]{maindoc};
} }
@Override @Override
public boolean isParseWithLimitsSupported() { public boolean isParseWithLimitsSupported() {
return true; return true;
} }
@Override @Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException { final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
final DigestURL parentTarURL = createParentTarURL(location); final DigestURL parentTarURL = createParentTarURL(location);
final TarArchiveInputStream tis = new TarArchiveInputStream(source); final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this tar container // create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this); final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside // loop through the elements in the tar file and parse every single file inside
TarArchiveEntry entry; TarArchiveEntry entry;
int totalProcessedLinks = 0; int totalProcessedLinks = 0;
while (true) { while (true) {
try { try {
entry = tis.getNextTarEntry(); entry = tis.getNextTarEntry();
if (entry == null) { if (entry == null) {
break; break;
} }
/* /*
* We are here sure at least one entry has still to be processed : let's check * We are here sure at least one entry has still to be processed : let's check
* now the bytes limit as sub parsers applied on eventual previous entries may * now the bytes limit as sub parsers applied on eventual previous entries may
* not support partial parsing and would have thrown a Parser.Failure instead of * not support partial parsing and would have thrown a Parser.Failure instead of
* marking the document as partially parsed. * marking the document as partially parsed.
*/ */
if (tis.getBytesRead() >= maxBytes) { if (tis.getBytesRead() >= maxBytes) {
maindoc.setPartiallyParsed(true); maindoc.setPartiallyParsed(true);
break; break;
} }
if (entry.isDirectory() || entry.getSize() <= 0) { if (entry.isDirectory() || entry.getSize() <= 0) {
continue; continue;
} }
final String name = entry.getName(); final String name = entry.getName();
final int idx = name.lastIndexOf('.'); final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : ""); final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
try { try {
/* /*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content * compressed content
*/ */
/* /*
* Create an appropriate sub location to prevent unwanted fallback to the * Create an appropriate sub location to prevent unwanted fallback to the
* tarparser on resources included in the archive. We use the tar file name as * tarparser on resources included in the archive. We use the tar file name as
* the parent sub path. Example : http://host/archive.tar/name. Indeed if we * the parent sub path. Example : http://host/archive.tar/name. Indeed if we
* create a sub location with a '#' separator such as * create a sub location with a '#' separator such as
* http://host/archive.tar#name, the extension of the URL is still ".tar", thus * http://host/archive.tar#name, the extension of the URL is still ".tar", thus
* incorrectly making the tar parser as a possible parser for the sub resource. * incorrectly making the tar parser as a possible parser for the sub resource.
*/ */
final DigestURL subLocation = new DigestURL(parentTarURL, name); final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999, final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead()); entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
/* /*
* If the parser(s) did not consume all bytes in the entry, these ones will be * If the parser(s) did not consume all bytes in the entry, these ones will be
* skipped by the next call to getNextTarEntry() * skipped by the next call to getNextTarEntry()
*/ */
if (subDocs == null) { if (subDocs == null) {
continue; continue;
} }
maindoc.addSubDocuments(subDocs); maindoc.addSubDocuments(subDocs);
for (Document subDoc : subDocs) { for (Document subDoc : subDocs) {
if (subDoc.getAnchors() != null) { if (subDoc.getAnchors() != null) {
totalProcessedLinks += subDoc.getAnchors().size(); totalProcessedLinks += subDoc.getAnchors().size();
} }
} }
/* /*
* Check if a limit has been exceeded (we are sure to pass here when maxLinks * Check if a limit has been exceeded (we are sure to pass here when maxLinks
* has been exceeded as this limit require parser support for partial parsing to * has been exceeded as this limit require parser support for partial parsing to
* be detected) * be detected)
*/ */
if (subDocs[0].isPartiallyParsed()) { if (subDocs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true); maindoc.setPartiallyParsed(true);
break; break;
} }
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
} }
} catch (final IOException e) { } catch (final IOException e) {
AbstractParser.log.warn("tar parser:" + e.getMessage()); AbstractParser.log.warn("tar parser:" + e.getMessage());
break; break;
} }
} }
return new Document[] { maindoc }; return new Document[] { maindoc };
} }
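Each tar entry is parsed with whatever remains of the overall budgets: the link budget shrinks by the anchors already collected, the byte budget by what the archive stream has already consumed, and the first partially parsed sub-document stops the loop. A self-contained simulation of that bookkeeping (entry names and numbers invented for the example):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class TarBudgetSketch {
    public static void main(String[] args) {
        // simulated archive entries: entry name -> number of anchors its parsed content would contribute
        final Map<String, Integer> entries = new LinkedHashMap<>();
        entries.put("a.html", 400);
        entries.put("b.html", 700);
        entries.put("c.html", 300);

        final int maxLinks = 1000;
        int totalProcessedLinks = 0;
        boolean partiallyParsed = false;

        for (Map.Entry<String, Integer> entry : entries.entrySet()) {
            final int remaining = maxLinks - totalProcessedLinks;   // budget left for this entry
            final int taken = Math.min(entry.getValue(), remaining);
            totalProcessedLinks += taken;
            if (taken < entry.getValue()) {   // the sub parser had to stop early
                partiallyParsed = true;       // corresponds to maindoc.setPartiallyParsed(true)
                break;
            }
        }
        System.out.println(totalProcessedLinks + " anchors kept, partially parsed: " + partiallyParsed);
    }
}
```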
/** /**
* Generate a parent URL to use for generating sub URLs on tar archive entries. * Generate a parent URL to use for generating sub URLs on tar archive entries.
* *
* @param tarURL * @param tarURL
* the URL of the tar archive * the URL of the tar archive
* @return an URL ending with a "/" suitable as a base URL for archive entries * @return an URL ending with a "/" suitable as a base URL for archive entries
*/ */
private DigestURL createParentTarURL(final DigestURL tarURL) { private DigestURL createParentTarURL(final DigestURL tarURL) {
String locationStr = tarURL.toNormalform(false); String locationStr = tarURL.toNormalform(false);
if (!locationStr.endsWith("/")) { if (!locationStr.endsWith("/")) {
locationStr += "/"; locationStr += "/";
} }
DigestURL parentTarURL; DigestURL parentTarURL;
try { try {
parentTarURL = new DigestURL(locationStr); parentTarURL = new DigestURL(locationStr);
} catch (MalformedURLException e1) { } catch (MalformedURLException e1) {
/* This should not happen */ /* This should not happen */
parentTarURL = tarURL; parentTarURL = tarURL;
} }
return parentTarURL; return parentTarURL;
} }
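createParentTarURL exists because of how the parser for a sub resource is chosen: if entry names were appended after a "#", the path would still end in ".tar" and the tar parser would match its own entries again; appending them as path segments gives each entry its own extension. A small demonstration of that difference with plain string handling (helper names are mine):

```java
public class TarSubLocationSketch {
    /** Make sure the archive URL ends with "/" so that entry names become path segments. */
    static String parentTarUrl(String tarUrl) {
        return tarUrl.endsWith("/") ? tarUrl : tarUrl + "/";
    }

    /** File extension of a URL, ignoring the '#' fragment as URL parsing does. */
    static String extensionOf(String url) {
        final int hash = url.indexOf('#');
        final String withoutRef = hash >= 0 ? url.substring(0, hash) : url;
        final int slash = withoutRef.lastIndexOf('/');
        final int dot = withoutRef.lastIndexOf('.');
        return dot > slash ? withoutRef.substring(dot + 1) : "";
    }

    public static void main(String[] args) {
        final String archive = "http://host/archive.tar";
        System.out.println(extensionOf(archive + "#entry.html"));               // tar  -> the tar parser would match again
        System.out.println(extensionOf(parentTarUrl(archive) + "entry.html"));  // html -> the html parser is chosen
    }
}
```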
/** /**
* Create the main resulting parsed document for a tar container * Create the main resulting parsed document for a tar container
* *
* @param location * @param location
* the parsed resource URL * the parsed resource URL
* @param mimeType * @param mimeType
* the media type of the resource * the media type of the resource
* @param charset * @param charset
* the charset name if known * the charset name if known
* @param parser * @param parser
* instance of tarParser that is registered as the parser origin of * instance of tarParser that is registered as the parser origin of
* the document * the document
* @return a Document instance * @return a Document instance
*/ */
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final tarParser parser) { final tarParser parser) {
final String filename = location.getFileName(); final String filename = location.getFileName();
final Document maindoc = new Document(location, mimeType, charset, parser, null, null, final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
AbstractParser AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date()); null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
return maindoc; return maindoc;
} }
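createMainDocument picks a human-readable title for the container: the unescaped archive file name when there is one, otherwise tokens derived from the URL. A minimal sketch of that fallback, using java.net.URLDecoder in place of YaCy's MultiProtocolURL.unescape:

```java
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;

public class ArchiveTitleSketch {
    /** Title for the archive container document: decoded file name, or a URL-derived fallback when it is empty. */
    static String title(String fileName, String urlTokensFallback) {
        if (fileName == null || fileName.isEmpty()) {
            return urlTokensFallback;
        }
        return URLDecoder.decode(fileName, StandardCharsets.UTF_8); // "my%20archive.tar" -> "my archive.tar"
    }

    public static void main(String[] args) {
        System.out.println(title("my%20archive.tar", "host archive"));
        System.out.println(title("", "host archive"));
    }
}
```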
public final static boolean isTar(File f) { public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false; if (!f.exists() || f.length() < 0x105) return false;
@ -39,6 +39,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
@ -72,7 +73,8 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final Set<String> ignore_class_name, final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream source) final InputStream source)
@ -121,7 +123,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize()); FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp); final Document[] docs = TextParser.parseSource(virtualURL, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue; if (docs == null) continue;
maindoc.addSubDocuments(docs); maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
@ -626,6 +626,7 @@ public class Crawler_p {
cachePolicy, cachePolicy,
collection, collection,
agentName, agentName,
TagValency.EVAL,
ignoreclassname, ignoreclassname,
new VocabularyScraper(vocabulary_scraper), new VocabularyScraper(vocabulary_scraper),
timezoneOffset); timezoneOffset);
@ -43,6 +43,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
@ -161,7 +162,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
collection, collection,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
null, null, TagValency.EVAL, null, null,
timezoneOffset); timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe); sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) { } catch (final Exception e) {
@ -709,7 +709,16 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType()); final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError); if (supportError != null) throw new IOException("no parser support: " + supportError);
try { try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent()); documents = TextParser.parseSource(
url,
responseHeader.getContentType(),
responseHeader.getCharacterEncoding(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
timezoneOffset,
response.depth(),
response.getContent());
if (documents == null) throw new IOException("document == null"); if (documents == null) throw new IOException("document == null");
} catch (final Exception e) { } catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage()); throw new IOException("parser error: " + e.getMessage());
@ -2945,7 +2945,8 @@ public final class Switchboard extends serverSwitch {
documents = TextParser.genericParseSource(new AnchorURL(response.url()), documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(), response.getMimeType(),
response.getCharacterEncoding(), response.getCharacterEncoding(),
response.profile().ignoreDivClassName(), response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(), response.profile().scraper(),
response.profile().timezoneOffset(), response.profile().timezoneOffset(),
response.depth(), response.depth(),
@ -2963,7 +2964,8 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()), new AnchorURL(response.url()),
response.getMimeType(), response.getMimeType(),
response.getCharacterEncoding(), response.getCharacterEncoding(),
response.profile().ignoreDivClassName(), response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(), response.profile().scraper(),
response.profile().timezoneOffset(), response.profile().timezoneOffset(),
response.depth(), response.depth(),
@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphConfiguration;
@ -162,24 +163,24 @@ public class DocumentIndex extends Segment {
} }
InputStream sourceStream = null; InputStream sourceStream = null;
try { try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream); documents = TextParser.parseSource(url, null, null, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) { } catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage()); throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally { } finally {
if(sourceStream != null) { if(sourceStream != null) {
try { try {
sourceStream.close(); sourceStream.close();
} catch(IOException e) { } catch(IOException e) {
ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage()); ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage());
} }
} }
} }
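DocumentIndex parses local files with the broadest defaults (no media type or charset hint, TagValency.EVAL and an empty switch set, so everything is evaluated) and closes the source stream in a finally block that only logs a warning, because a close failure should not discard an already parsed document. That quiet-close idiom, isolated (names are illustrative):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class QuietCloseSketch {
    /** Close the source stream after parsing; a failure to close is logged, not rethrown. */
    static void closeQuietly(InputStream sourceStream, String context) {
        if (sourceStream != null) {
            try {
                sourceStream.close();
            } catch (IOException e) {
                System.err.println(context + ": could not close source stream : " + e.getMessage());
            }
        }
    }

    public static void main(String[] args) {
        closeQuietly(new ByteArrayInputStream(new byte[0]), "DocumentIndex");
        System.out.println("parse result kept even if closing had failed");
    }
}
```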
//Document document = Document.mergeDocuments(url, null, documents); //Document document = Document.mergeDocuments(url, null, documents);
final SolrInputDocument[] rows = new SolrInputDocument[documents.length]; final SolrInputDocument[] rows = new SolrInputDocument[documents.length];
int c = 0; int c = 0;
for ( final Document document : documents ) { for ( final Document document : documents ) {
if (document == null) continue; if (document == null) continue;
final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0); final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
rows[c++] = rows[c++] =
super.storeDocument( super.storeDocument(