crawl profile adoption to new tag valency attribute

pull/554/head
Michael Christen 2 years ago
parent 5acd98f4da
commit 4304e07e6f
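This commit generalizes the former ignore_class_name parameter into a pair of values: a default TagValency that applies to all tags, and a set of tag/class names at which that default is switched. All default crawl profiles and existing call sites pass TagValency.EVAL, so their behaviour is unchanged. For orientation only, the TagValency type imported below is presumably a small enum along the following lines; the IGNORE constant and the invert() helper are illustrative assumptions, not part of this diff:

    public enum TagValency {
        EVAL,    // tag content is evaluated (the default passed throughout this commit)
        IGNORE;  // tag content is skipped

        // assumed helper: the value applied when a valency switch tag/class matches
        public TagValency invert() {
            return this == EVAL ? IGNORE : EVAL;
        }
    }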

@ -51,6 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
@ -60,7 +61,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob";
@ -75,7 +76,7 @@ public final class CrawlSwitchboard {
public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static {
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
DEFAULT_PROFILES.add(CRAWL_PROFILE_RECRAWL_JOB);
DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
@ -93,11 +94,11 @@ public final class CrawlSwitchboard {
// Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl */
/**
* The default recrawl time cycle in minutes for recrawl jobs. The recrawl date
* limit can be set up by the recrawl job selection query, but a default limit
* prevents unwanted overload on targets.
*/
public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // one hour
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
@ -139,7 +140,7 @@ public final class CrawlSwitchboard {
try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException | SpaceExceededException | RuntimeException e ) {
ConcurrentLog.warn("CrawlProfiles", "Could not load profile " + handle, e);
p = null;
}
if ( p == null ) {
@ -275,16 +276,15 @@ public final class CrawlSwitchboard {
public RowHandleSet getURLHashes(final byte[] profileKey) {
return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey));
}
private void initActiveCrawlProfiles() {
final Switchboard sb = Switchboard.getSwitchboard();
// generate new default entry for deep auto crawl
this.defaultAutocrawlDeepProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_DEEP,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
@ -308,12 +308,13 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
this.defaultAutocrawlDeepProfile);
// generate new default entry for shallow auto crawl
this.defaultAutocrawlShallowProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
@ -341,6 +342,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -364,7 +366,7 @@ public final class CrawlSwitchboard {
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
@ -373,6 +375,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -405,6 +408,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -437,6 +441,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -469,6 +474,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -509,6 +515,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -541,6 +548,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -573,6 +581,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -605,6 +614,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -640,6 +650,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);

@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@ -355,7 +356,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL, null, null, 0);
return profile;
}

@ -55,6 +55,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
@ -69,19 +70,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
/** Regular expression pattern matching everything */
public static final String MATCH_ALL_STRING = ".*";
/** Regular expression pattern matching nothing */
public static final String MATCH_NEVER_STRING = "";
/** Empty Solr query */
public static final String SOLR_EMPTY_QUERY = "";
/** Match all Solr query */
public static final String SOLR_MATCH_ALL_QUERY = AbstractSolrConnector.CATCHALL_QUERY;
/** Regular expression matching everything */
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
/** Regular expression matching nothing */
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
@ -126,14 +127,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
public static final int BOOLEAN = 0;
public static final int INTEGER = 1;
public static final int STRING = 2;
public final String key, label;
public final boolean readonly;
public final int type;
@ -143,39 +145,39 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.type = type;
this.label = label;
}
@Override
public String toString() {
return this.key;
}
}
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustMatch = null;
/** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustNotMatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
/** Pattern on the media type documents must match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustMatch = null;
/** Pattern on the media type documents must not match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustNotMatch = null;
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final TagValency defaultValency;
private final Set<String> valencySwitchTagNames;
private final VocabularyScraper scraper;
/**
@ -238,7 +240,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@ -252,40 +255,42 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
put(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
put(CrawlAttribute.INDEXING_URL_MUSTMATCH.key, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
put(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
put(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
put(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
put(CrawlAttribute.DEPTH.key, depth);
put(CrawlAttribute.DIRECT_DOC_BY_URL.key, directDocByURL);
put(CrawlAttribute.RECRAWL_IF_OLDER.key, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
put(CrawlAttribute.DOM_MAX_PAGES.key, domMaxPages);
put(CrawlAttribute.CRAWLING_Q.key, crawlingQ); // crawling of urls with '?'
put(CrawlAttribute.FOLLOW_FRAMES.key, followFrames); // load pages contained in frames or iframes
put(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
put(CrawlAttribute.OBEY_HTML_ROBOTS_NOFOLLOW.key, obeyHtmlRobotsNofollow);
put(CrawlAttribute.INDEX_TEXT.key, indexText);
put(CrawlAttribute.INDEX_MEDIA.key, indexMedia);
put(CrawlAttribute.STORE_HTCACHE.key, storeHTCache);
put(CrawlAttribute.REMOTE_INDEXING.key, remoteIndexing);
put(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key, snapshotsMaxDepth);
put(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key, snapshotsLoadImage);
put(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key, snapshotsReplaceOld);
put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the ignore_class_name and scraper information into a JSON Array
this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
String jsonString = new JSONArray(ignore_class_name).toString();
put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
String jsonString = new JSONArray(valencySwitchTagNames).toString();
put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
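The two new profile attributes round-trip through the profile's underlying key/value map: the default valency is stored under its attribute key via name() and restored with valueOf(), falling back to EVAL when the entry is missing (see the reading constructor in the next hunk), while the switch tag/class names are serialized as a JSON array, just as the old ignore_class_name set was. A minimal sketch of that round trip, with hypothetical class names and the bundled JSONArray assumed:

    // sketch only: persist and restore the new attributes (keys as declared in CrawlAttribute)
    final Map<String, String> ext = new HashMap<String, String>();
    ext.put("default_valency", TagValency.EVAL.name());
    ext.put("valency_switch_tag_name", new JSONArray(Arrays.asList("sidebar", "nav")).toString()); // hypothetical names

    final String v = ext.get("default_valency");
    final TagValency restored = (v == null || v.length() == 0) ? TagValency.EVAL : TagValency.valueOf(v);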
@ -305,9 +310,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key);
JSONArray a;
if(jsonString == null) {
if (jsonString == null) {
a = new JSONArray();
} else {
try {
@ -317,9 +324,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
a = new JSONArray();
}
}
this.ignore_class_name = new HashSet<String>();
this.valencySwitchTagNames = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
this.ignore_class_name.add(a.getString(i));
this.valencySwitchTagNames.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
@ -336,14 +343,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
public TagValency defaultValency() {
return this.defaultValency;
}
public Set<String> valencySwitchTagNames() {
return this.valencySwitchTagNames;
}
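With these accessors, a caller that holds a CrawlProfile can hand both values straight to the parser layer; the extended TextParser.parseSource() signature later in this commit accepts them right after the charset. A hedged call-site sketch (profile, url and the size/stream variables are placeholders):

    // sketch: parse with the profile's valency settings instead of a bare ignore set
    final Document[] docs = TextParser.parseSource(
            url, mimeType, charset,
            profile.defaultValency(),          // TagValency.EVAL unless the profile says otherwise
            profile.valencySwitchTagNames(),   // tag/class names at which the default valency is switched
            profile.scraper(), timezoneOffset, depth, contentLength, sourceStream);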
public VocabularyScraper scraper() {
return this.scraper;
}
public void domInc(final String domain) {
if (domain == null) return; // may be correct for file system crawls
final AtomicInteger dp = this.doms.get(domain);
@ -427,7 +438,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
//if (r == null) return null;
return r;
}
private Map<String, Pattern> cmap = null;
/**
@ -440,7 +451,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.cmap = collectionParser(r);
return this.cmap;
}
public static Map<String, Pattern> collectionParser(String collectionString) {
if (collectionString == null || collectionString.length() == 0) return new HashMap<String, Pattern>();
String[] cs = CommonPattern.COMMA.split(collectionString);
@ -470,7 +481,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String r = get(CrawlAttribute.COLLECTIONS.key);
return r == null || r.length() == 0 || "user".equals(r) ? name() : r;
}
/**
* Gets the regex which must be matched by URLs in order to be crawled.
* @return regex which must be matched
@ -484,7 +495,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerurlmustmatch;
}
/**
* Render the urlMustMatchPattern as a String of limited size, suffixing it with
* "..." when it is truncated. Used to prevent unnecessary growth of the logs,
@ -516,7 +527,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerurlmustnotmatch;
}
/**
* Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
*
@ -538,7 +549,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.crawlerOriginUrlMustMatch;
}
/**
* Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
*
@ -601,7 +612,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (list.length == 1 && list.length == 0) list = new String[0];
return list;
}
/**
* If the regex matches with the url, then there is no depth limit on the crawl (it overrides depth == 0)
* @return regex which must be matched
@ -643,7 +654,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexurlmustnotmatch;
}
/**
* Gets the regex which must be matched by URLs in order to be indexed.
* @return regex which must be matched
@ -671,7 +682,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexcontentmustnotmatch;
}
/**
* Get the Pattern on media type that documents must match in order to be indexed
*
@ -693,7 +704,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexMediaTypeMustMatch;
}
/**
* Get the Pattern on media type that documents must not match in order to be indexed
*
@ -715,9 +726,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
return this.indexMediaTypeMustNotMatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
@ -743,7 +752,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross check the eventual URL file
* extension against the actual Media Type, even when file extension is
@ -772,7 +781,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public void setCacheStrategy(final CacheStrategy newStrategy) {
put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
}
/**
* Gets the minimum date that an entry must have to be re-crawled.
* @return time in ms representing a date
@ -847,13 +856,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(CrawlAttribute.REMOTE_INDEXING.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public int snapshotMaxdepth() {
final String r = get(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key);
if (r == null) return -1;
@ -866,7 +875,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return -1;
}
}
public boolean snapshotLoadImage() {
final String r = get(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key);
if (r == null) return false;
@ -878,7 +887,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public Pattern snapshotsMustnotmatch() {
if (this.snapshotsMustnotmatch == null) {
final String r = get(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key);
@ -887,7 +896,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
} catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.snapshotsMustnotmatch;
}
}
public int timezoneOffset() {
final String timezoneOffset = get(CrawlAttribute.TIMEZONEOFFSET.key);
@ -898,7 +907,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0;
}
}
/**
* get a recrawl date for a given age in minutes
* @param oldTimeMinutes
@ -946,7 +955,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(url.getPath()).append(".*").toString();
}
public boolean isPushCrawlProfile() {
return this.name().startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB);
}
@ -1008,7 +1017,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());
int i = 0;
if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
String item;
@ -1021,7 +1030,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();

@ -48,6 +48,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
public class Response {
@ -853,7 +854,7 @@ public class Response {
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
// 7) local surrogates processing (can not be known here : crawl profile is required)
EventOrigin processCase = EventOrigin.UNKNOWN;
// FIXME the equals seems to be incorrect: String.equals(boolean)
if (initiator() == null || initiator().length == 0 || ASCII.String(initiator()).equals("------------")) {
@ -873,9 +874,13 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(
url(), this.responseHeader == null ? null : this.responseHeader.getContentType(),
this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(),
TagValency.EVAL, new HashSet<String>(),
new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
return null;
}

@ -32,6 +32,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.TagValency;
public abstract class AbstractParser implements Parser {
@ -41,20 +42,20 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
/**
* initialize a parser with a name
* @param name
*/
public AbstractParser(final String name) {
this.name = name;
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
*/
@Override
public Document[] parse(
DigestURL url,
@ -64,7 +65,7 @@ public abstract class AbstractParser implements Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
return parse(url, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
@ -72,15 +73,15 @@ public abstract class AbstractParser implements Parser {
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
@ -88,32 +89,33 @@ public abstract class AbstractParser implements Parser {
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
return parseWithLimits(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
public Document[] parseWithLimits(
DigestURL location,
String mimeType,
String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes);
}
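These default implementations keep the old behaviour: the extended overloads simply drop the valency arguments and delegate to the shorter signatures, so existing parser classes compile unchanged. Only a parser that actually evaluates tags (presumably the HTML parser) needs to override the extended form; a rough sketch, where applyValency() stands in for whatever a concrete parser does with the settings:

    @Override
    public Document[] parse(final DigestURL url, final String mimeType, final String charset,
            final TagValency defaultValency, final Set<String> valencySwitchTagNames,
            final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
            throws Parser.Failure, InterruptedException {
        // illustrative only: feed the valency settings into the tag scraping instead of discarding them
        return applyValency(defaultValency, valencySwitchTagNames, url, mimeType, charset, scraper, timezoneOffset, source);
    }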
/**
* return the name of the parser
*/
@ -164,12 +166,11 @@ public abstract class AbstractParser implements Parser {
if (t != null) c.add(t);
return c;
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
return false;
}
}

@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.parser.html.TagValency;
public interface Parser {
@ -63,72 +64,87 @@ public interface Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, eventually terminating processing when a total of
* maxLinks URLs (anchors, image links, media links...) have been reached,
* or when maxBytes content bytes have been processed, thus potentially
* resulting in partially parsed documents (with
* {@link Document#isPartiallyParsed()} returning true). Some parser
* implementations will not support parsing within maxLinks or maxBytes
* limits: make sure to check this first by calling
* {@link #isParseWithLimitsSupported()}, or an UnsupportedOperationException
* could be thrown.
*
* @param url
* the URL of the source
* @param mimeType
* the mime type of the source, if known
* @param charset
* the charset name of the source, if known
* @param scraper
* an entity scraper to detect facets from text annotation
* context
* @param timezoneOffset
* the local time zone offset
* @param source
* an input stream
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
* @throws InterruptedException
* when the processing was interrupted before termination
* @throws UnsupportedOperationException
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.
*/
public boolean isParseWithLimitsSupported();
// methods that shall make it possible to put Parser objects into a hashtable

@ -51,6 +51,7 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser;
@ -184,7 +185,8 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -201,7 +203,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -218,7 +220,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -236,7 +239,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@ -248,7 +251,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -261,14 +265,15 @@ public final class TextParser {
final Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -330,7 +335,7 @@ public final class TextParser {
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
return parseSource(location, mimeType, parser, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (final Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@ -378,11 +383,11 @@ public final class TextParser {
int maxBytesToRead = -1;
if(maxBytes < Integer.MAX_VALUE) {
/* Load at most maxBytes + 1 :
- to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
maxBytesToRead = (int)maxBytes + 1;
}
if(contentLength >= 0 && contentLength < maxBytesToRead) {
if (contentLength >= 0 && contentLength < maxBytesToRead) {
maxBytesToRead = (int)contentLength;
}
@ -392,16 +397,23 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
public static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
@ -424,10 +436,19 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
public static Document[] parseWithLimits(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream,
int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -449,10 +470,11 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
public static Document[] parseWithLimits(
final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
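The overload without valency parameters remains for callers that have no crawl profile at hand (such as Response.parse() above); it simply defaults to TagValency.EVAL and an empty switch set. A hedged comparison of the two call styles (all arguments are placeholders):

    // old arity: implicitly TagValency.EVAL, no switch tags
    final Document[] a = TextParser.parseWithLimits(url, mimeType, charset,
            timezoneOffset, depth, contentLength, sourceStream, maxLinks, maxBytes);
    // profile-aware arity: explicit valency handling
    final Document[] b = TextParser.parseWithLimits(url, mimeType, charset,
            profile.defaultValency(), profile.valencySwitchTagNames(),
            timezoneOffset, depth, contentLength, sourceStream, maxLinks, maxBytes);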
@ -475,7 +497,8 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream,
@ -491,11 +514,11 @@ public final class TextParser {
try {
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser do not support partial parsing within limits : let's control it here*/
final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(final Parser.Failure e) {
@ -524,7 +547,8 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -552,13 +576,13 @@ public final class TextParser {
}
try {
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
/* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

@ -68,6 +68,7 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -101,11 +102,11 @@ public class MediawikiImporter extends Thread implements Importer {
public MediawikiImporter(final MultiProtocolURL sourcefile, final File targetdir) {
super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")");
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
super("MediawikiImporter(" + sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile" +")");
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
this.targetdir = targetdir;
this.count = 0;
this.start = 0;
this.hostport = null;
@ -154,7 +155,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
@SuppressWarnings("resource")
@Override
public void run() {
this.start = System.currentTimeMillis();
final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
@ -179,8 +180,8 @@ public class MediawikiImporter extends Thread implements Importer {
boolean page = false, text = false;
String title = null;
final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
final ExecutorService service = Executors.newCachedThreadPool(
new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer"));
final convertConsumer[] consumers = new convertConsumer[threads];
final Future<?>[] consumerResults = (Future<?>[]) Array.newInstance(Future.class, threads);
for (int i = 0; i < threads; i++) {
@ -276,23 +277,23 @@ public class MediawikiImporter extends Thread implements Importer {
consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
}
} catch (final Exception e) {
this.errorMessage = e.getMessage();
ConcurrentLog.logException(e);
} finally {
out.put(poison); // output thread condition (for file.close)
writerResult.get(10000, TimeUnit.MILLISECONDS);
}
} catch (final Exception e) {
this.errorMessage = e.getMessage();
ConcurrentLog.logException(e);
} finally {
if(reader != null) {
try {
reader.close();
} catch (IOException e) {
ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage());
}
}
try {
out.put(poison); // out keeps output file open until poisened, to close file if exception happend in this block
} catch (InterruptedException ex) { }
@ -310,7 +311,7 @@ public class MediawikiImporter extends Thread implements Importer {
File mediawikixml;
public indexMaker(final File mediawikixml) {
super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : "");
super("MediawikiImporter.indexMaker " + mediawikixml != null ? mediawikixml.getName() : "");
this.mediawikixml = mediawikixml;
}
@ -337,8 +338,8 @@ public class MediawikiImporter extends Thread implements Importer {
final PositionAwareReader in = new PositionAwareReader(dumpFile);
final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
final wikiConsumer consumer = new wikiConsumer(100, producer);
final ExecutorService service = Executors.newCachedThreadPool(
new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex"));
final Future<Integer> producerResult = service.submit(consumer);
final Future<Integer> consumerResult = service.submit(producer);
service.shutdown();
@ -535,14 +536,14 @@ public class MediawikiImporter extends Thread implements Importer {
}
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);
} catch (final MalformedURLException e1) {
ConcurrentLog.logException(e1);
}
}
public void writeXML(final OutputStreamWriter os) throws IOException {
this.document.writeXML(os);
@ -676,9 +677,9 @@ public class MediawikiImporter extends Thread implements Importer {
} catch (final Parser.Failure e) {
ConcurrentLog.logException(e);
} catch (final IOException e) {
// TODO Auto-generated catch block
ConcurrentLog.logException(e);
}
}
}
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
@ -772,78 +773,78 @@ public class MediawikiImporter extends Thread implements Importer {
}
public static void main(final String[] s) {
if (s.length == 0) {
System.out.println("usage:");
System.out.println(" -index <wikipedia-dump>");
System.out.println(" -read <start> <len> <idx-file>");
System.out.println(" -find <title> <wikipedia-dump>");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
ConcurrentLog.shutdown();
return;
}
try {
// example:
// java -Xmx2000m -cp classes:lib/bzip2.jar
// de.anomic.tools.mediawikiIndex -convert
// DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2
// DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert")) {
if(s.length < 3) {
System.out.println("usage:");
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
ConcurrentLog.shutdown();
return;
}
final File targetdir = new File(s[2]);
try {
final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir);
mi.start();
mi.join();
} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);
}
}
if (s[0].equals("-index")) {
try {
createIndex(new File(s[1]));
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
if (s[0].equals("-read")) {
final long start = Integer.parseInt(s[1]);
final int len = Integer.parseInt(s[2]);
System.out.println(UTF8.String(read(new File(s[3]), start, len)));
}
if (s[0].equals("-find")) {
try {
final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
if (w == null) {
ConcurrentLog.info("WIKITRANSLATION", "not found");
} else {
System.out.println(UTF8.String(read(new File(s[2]), w.start, (int) (w.end - w.start))));
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
} finally {
try {
HTTPClient.closeConnectionManager();
} catch (InterruptedException e) {
e.printStackTrace();
}
ConcurrentLog.shutdown();
}
}
}
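The parseSource call at the top of this hunk illustrates the migration pattern applied throughout this commit: the former set-of-ignored-class-names argument is replaced by the pair TagValency.EVAL plus a (here empty) set of valency-switch names. A minimal sketch of that call shape; pageUrl and html are placeholders standing in for the importer's fields:

    // Sketch only: parameter order mirrors the parseSource call above:
    // url, mime, charset, default tag valency, valency-switch tag names,
    // vocabulary scraper, timezone offset, depth, content bytes.
    final Document[] parsed = TextParser.parseSource(
            pageUrl,                    // placeholder for the importer's url field
            "text/html",
            StandardCharsets.UTF_8.name(),
            TagValency.EVAL,            // evaluate all tags by default
            new HashSet<String>(),      // no tag/class switches the valency
            new VocabularyScraper(),
            0,                          // timezone offset
            1,                          // crawl depth
            UTF8.getBytes(html));       // placeholder for the raw page html
    final Document doc = Document.mergeDocuments(pageUrl, "text/html", parsed);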

@ -37,6 +37,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
/**
* Base class for parsing compressed files relying on Apache commons-compress
@ -44,25 +45,25 @@ import net.yacy.document.VocabularyScraper;
*/
public abstract class AbstractCompressorParser extends AbstractParser implements Parser {
/** Crawl depth applied when parsing internal compressed content */
protected static final int DEFAULT_DEPTH = 999;
/**
* @param name the human readable name of the parser
*/
public AbstractCompressorParser(final String name) {
super(name);
}
/**
* @param source an open input stream on a compressed source
* @return a subclass of CompressorInputStream capable of uncompressing the source
*         on the fly
* @throws IOException if an error occurs while opening the compressed stream
*/
protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;
/**
* Maps the given name of a compressed file to the name that the
* file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned.
@ -72,116 +73,137 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
*/
protected abstract String getUncompressedFilename(final String filename);
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
Long.MAX_VALUE);
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
compressedInStream = createDecompressStream(source);
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
try {
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if (docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Parser.Failure e) {
throw e;
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
return new Document[] { maindoc };
}
/**
* Create the main parsed document for the compressed document at the given URL
* and Media type
*
* @param location the parsed resource URL
* @param mimeType the media type of the resource
* @param charset the charset name if known
* @param parser an instance of CompressorParser that is registered as the
* parser origin of the document
* @return a Document instance
*/
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final AbstractCompressorParser parser) {
final String filename = location.getFileName();
return new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
}
/**
* Parse content in an open stream uncompressing on the fly a compressed
* resource.
*
* @param location the URL of the compressed resource
* @param charset the charset name if known
* @param ignoreClassNames an optional set of CSS class names whose matching
*                          html elements' content should be ignored
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the
* compressed content
* @param maxLinks the maximum total number of links to parse and add
* to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/*
* Use the uncompressed file name for sub parsers to not unnecessarily use again
* this same uncompressing parser
*/
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parse(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
Long.MAX_VALUE);
}
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
compressedInStream = createDecompressStream(source);
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
try {
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, defaultValency, valencySwitchTagNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if (docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Parser.Failure e) {
throw e;
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
return new Document[] { maindoc };
}
/**
* Create the main parsed document for the compressed document at the given URL
* and Media type
*
* @param location the parsed resource URL
* @param mimeType the media type of the resource
* @param charset the charset name if known
* @param parser an instance of CompressorParser that is registered as the
* parser origin of the document
* @return a Document instance
*/
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final AbstractCompressorParser parser) {
final String filename = location.getFileName();
return new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
}
/**
* Parse content in an open stream uncompressing on the fly a compressed
* resource.
*
* @param location the URL of the compressed resource
* @param charset the charset name if known
* @param defaultValency the default valency applied to html tag content
* @param valencySwitchTagNames an optional set of tag or class names whose matching
*                          html elements switch from the default valency
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the
* compressed content
* @param maxLinks the maximum total number of links to parse and add
* to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(
final DigestURL location,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream,
final int maxLinks,
final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/*
* Use the uncompressed file name for sub parsers to not unnecessarily use again
* this same uncompressing parser
*/
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(
contentLocation, mime, charset, defaultValency, valencySwitchTagNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
}
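To show how this abstract base is meant to be extended, a hedged sketch of a concrete subclass; the xz variant below is hypothetical and used only for illustration, it is not part of this diff. Only the decompressor factory and the filename mapping are supplied, while the TagValency-aware parse and parseWithLimits logic above is inherited unchanged:

    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.commons.compress.compressors.CompressorInputStream;
    import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
    import org.apache.commons.compress.compressors.xz.XZUtils;

    // Hypothetical subclass, shown only to illustrate the two extension points.
    public class XZExampleParser extends AbstractCompressorParser {

        public XZExampleParser() {
            super("XZ Compressed Archive Parser (example)");
            this.SUPPORTED_EXTENSIONS.add("xz"); // SUPPORTED_EXTENSIONS as used by bzipParser below
        }

        @Override
        protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
            return new XZCompressorInputStream(source); // commons-compress decompressor
        }

        @Override
        protected String getUncompressedFilename(final String filename) {
            // maps "file.txt.xz" to "file.txt", analogous to GzipUtils/BZip2Utils below
            return XZUtils.getUncompressedFilename(filename);
        }
    }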

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -52,7 +53,7 @@ import net.yacy.kelondro.util.FileUtils;
* Unzips and parses the content and adds it to the created main document
*/
public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
this.SUPPORTED_EXTENSIONS.add("bz2");
@ -70,7 +71,8 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -99,25 +101,25 @@ public class bzipParser extends AbstractParser implements Parser {
out = null;
} catch(Exception e) {
if (tempFile != null) {
FileUtils.deletedelete(tempFile);
}
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
} finally {
if(zippedContent != null) {
try {
zippedContent.close();
} catch (IOException ignored) {
log.warn("Could not close bzip input stream");
}
}
if(out != null) {
try {
out.close();
} catch (IOException e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
}
}
}
try {
// create maindoc for this bzip container, register with supplied url & mime
@ -125,7 +127,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile);
final Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -140,7 +142,7 @@ public class bzipParser extends AbstractParser implements Parser {
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
/**
@ -151,9 +153,9 @@ public class bzipParser extends AbstractParser implements Parser {
* @param parser instance of bzipParser that is registered as the parser origin of the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) {
final String filename = location.getFileName();
Document maindoc = new Document(
location,
mimeType,
charset,
@ -172,49 +174,48 @@ public class bzipParser extends AbstractParser implements Parser {
null,
false,
new Date());
return maindoc;
}
/**
* Parse content in an open stream uncompressing on the fly a bzipped resource.
* @param location the URL of the bzipped resource
* @param charset the charset name if known
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the compressed content
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
*/
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName();
final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/* Use the uncompressed file name for sub parsers so that this same bzip parser is not unnecessarily applied again */
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
}
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Parser.Failure {
Document maindoc = null;
BZip2CompressorInputStream zippedContent = null;
try {
@ -222,23 +223,23 @@ public class bzipParser extends AbstractParser implements Parser {
zippedContent = new BZip2CompressorInputStream(source);
} catch(Exception e) {
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
}
try {
// create maindoc for this bzip container, register with supplied url & mime
maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final Document[] docs = parseCompressedInputStream(location, null, timezoneOffset, 999, zippedContent, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if(docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Exception e) {
if (e instanceof Parser.Failure) {
throw (Parser.Failure) e;
}
throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
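For context, a hedged sketch of how a caller can use the partial-parsing contract that bzipParser (and the other compressed-content parsers in this diff) advertise via isParseWithLimitsSupported(); the limit values and the names location/in are illustrative only:

    // Sketch only: "location" is a DigestURL of a .bz2 resource, "in" an open InputStream on it.
    final bzipParser parser = new bzipParser();
    try {
        final Document[] docs = parser.parseWithLimits(
                location, "application/x-bzip2", null,
                new VocabularyScraper(), 0, in,
                1000,               // maxLinks (example value)
                10L * 1024 * 1024); // maxBytes (example value)
        if (docs != null && docs.length > 0 && docs[0].isPartiallyParsed()) {
            // one of the limits was reached; the result is usable but incomplete
        }
    } catch (final Parser.Failure e) {
        // handle the parse failure for this location
    }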

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -52,8 +53,8 @@ import net.yacy.kelondro.util.FileUtils;
* Unzips and parses the content and adds it to the created main document
*/
public class gzipParser extends AbstractParser implements Parser {
private static final int DEFAULT_DEPTH = 999;
public gzipParser() {
super("GNU Zip Compressed Archive Parser");
@ -72,7 +73,8 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
@ -84,10 +86,10 @@ public class gzipParser extends AbstractParser implements Parser {
try {
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening,
 * so that special error handling can be applied if needed */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
        new GZIPOpeningStreamException());
}
try {
int read = 0;
@ -103,32 +105,32 @@ public class gzipParser extends AbstractParser implements Parser {
out.write(data, 0, read);
}
} catch(Exception e) {
if (tempFile != null) {
FileUtils.deletedelete(tempFile);
}
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
} finally {
if(zippedContent != null) {
try {
zippedContent.close();
} catch (IOException ignored) {
log.warn("Could not close gzip input stream");
}
}
if(out != null) {
try {
out.close();
} catch (IOException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
}
}
}
try {
maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -149,96 +151,96 @@ public class gzipParser extends AbstractParser implements Parser {
* @param parser an instance of gzipParser that is registered as the parser origin of the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
final String filename = location.getFileName();
Document maindoc = new Document(
location,
mimeType,
charset,
parser,
null,
null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
return maindoc;
}
/**
* Parse content in an open stream uncompressing on the fly a gzipped resource.
* @param location the URL of the gzipped resource
* @param charset the charset name if known
* @param timezoneOffset the local time zone offset
* @param compressedInStream an open stream uncompressing on the fly the compressed content
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
*/
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName();
final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Parser.Failure {
Document maindoc = null;
GZIPInputStream zippedContent = null;
try {
/* Only use an in-memory stream here (no temporary file): the parsers
 * matching compressed content are expected to handle the maxBytes limit properly and terminate
 * before a possible OutOfMemoryError occurs */
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller that the error occurred directly on stream opening,
 * so that special error handling can be applied if needed */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
        new GZIPOpeningStreamException());
}
try {
maindoc = createMainDocument(location, mimeType, charset, this);
Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if(docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
@ -251,15 +253,15 @@ public class gzipParser extends AbstractParser implements Parser {
*/
public class GZIPOpeningStreamException extends Exception {
/** The serialization ID */
private static final long serialVersionUID = 2824038185373304636L;
public GZIPOpeningStreamException() {
super();
}
public GZIPOpeningStreamException(final String message) {
super(message);
}
}
}
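The comments above describe GZIPOpeningStreamException as a marker attached to the Parser.Failure when the stream cannot even be opened as gzip. A hedged sketch of the special handling a caller might apply, assuming the third Failure constructor argument is exposed as the exception cause; gzip, location and sourceStream are placeholders, and the enclosing method is assumed to declare throws InterruptedException and Parser.Failure:

    // Sketch only: distinguish "not gzip at all" from a failure inside the unpacked content.
    Document[] docs = null;
    try {
        docs = gzip.parse(location, "application/gzip", null,
                TagValency.EVAL, new HashSet<String>(),
                new VocabularyScraper(), 0, sourceStream);
    } catch (final Parser.Failure e) {
        if (e.getCause() instanceof gzipParser.GZIPOpeningStreamException) {
            // the content was not gzip-encoded at all: try other candidate parsers instead
        } else {
            throw e;
        }
    }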

@ -63,7 +63,7 @@ import net.yacy.document.parser.html.TransformerWriter;
public class htmlParser extends AbstractParser implements Parser {
/** The default maximum number of links (other than a, area, and canonical and stylesheet links) to add to a parsed document */
private static final int DEFAULT_MAX_LINKS = 10000;
public htmlParser() {
@ -108,42 +108,93 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location,
mimeType,
documentCharset,
TagValency.EVAL,
new HashSet<String>(),
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
public Document[] parse(
final DigestURL location,
final String mimeType,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location, mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Failure {
return parseWithLimits(
location,
mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
maxLinks,
maxLinks,
maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
throws Failure {
private Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxAnchors,
final int maxLinks,
final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -152,10 +203,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in the crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // a head tag fragment is only allowed on a url without an anchor hash-fragment, but there are discussions that the existence of a hash-fragment anchor takes preference (meaning: allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -221,7 +272,16 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final int timezoneOffset,
final String input,
final int maxAnchors,
final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -231,7 +291,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method there is no need to init the local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -256,7 +316,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
@ -264,7 +325,7 @@ public class htmlParser extends AbstractParser implements Parser {
final int maxAnchors,
final int maxLinks,
final long maxBytes) throws Parser.Failure, IOException {
// make a scraper
String charset = null;
@ -280,8 +341,8 @@ public class htmlParser extends AbstractParser implements Parser {
htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
valencySwitchTagNames,
defaultValency,
vocabularyScraper,
location,
false,
@ -325,26 +386,26 @@ public class htmlParser extends AbstractParser implements Parser {
location,
maxAnchors,
maxLinks,
ignore_class_name,
valencySwitchTagNames,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
final Reader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0, maxChars);
if(copiedChars > maxChars) {
/* maxChars limit has been exceeded : do not fail here as we want to use the partially obtained results. */
scraper.setContentSizeLimitExceeded(true);
} else if(copiedChars == maxChars) {
/* Exactly maxChars limit reached : let's check if more to read remain. */
if(sourceReader.read() >= 0) {
scraper.setContentSizeLimitExceeded(true);
}
}
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
writer.flush();
//sourceStream.close(); keep open for multiple parsing (close is done by the caller)
@ -456,9 +517,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
*/
private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
final DigestURL location, final String mimeType, final String documentCharset,
final TagValency defaultValency, final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@ -476,17 +538,17 @@ public class htmlParser extends AbstractParser implements Parser {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {
try {
snapshotStream.close();
} catch(IOException e) {
AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage());
}
}
}
AbstractParser.log.info("parse snapshot "+locationSnapshot.toString() + " additional to " + location.toString());
} catch (IOException | Failure ex) { }
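A hedged usage sketch of the extended parseToScraper entry point shown above; the HTML string and the class name in the switch set are made up, the exact effect of the switch set depends on the TagValency semantics introduced by this commit, and the enclosing method is assumed to declare throws IOException and MalformedURLException:

    // Sketch only: scrape a small HTML string, evaluating all tags by default and
    // switching the valency for elements carrying the (made-up) class "sidebar".
    final Set<String> valencySwitchTagNames = new HashSet<String>();
    valencySwitchTagNames.add("sidebar");
    final ContentScraper scraper = htmlParser.parseToScraper(
            new DigestURL("http://example.org/index.html"),
            StandardCharsets.UTF_8.name(),
            TagValency.EVAL,
            valencySwitchTagNames,
            new VocabularyScraper(),
            0,    // timezone offset
            "<html><body><div class=\"sidebar\">aside</div><p>main text</p></body></html>",
            100,  // maxAnchors
            100); // maxLinks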

@ -44,6 +44,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
@ -63,7 +64,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
@ -94,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset);
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -116,10 +118,11 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source));
return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
@ -127,14 +130,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -148,7 +152,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private Set<String> ignore_class_name;
private final TagValency defaultValency;
private Set<String> valencySwitchTagNames;
private final int timezoneOffset;
public SZParserExtractCallback(
@ -156,13 +161,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler,
final Document doc,
final String prefix,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.ignore_class_name = ignore_class_name;
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames;
this.timezoneOffset = timezoneOffset;
}
@ -205,7 +212,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}
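For completeness, a hedged sketch of invoking the 7zip parser through its byte-array entry point with the new valency pair; archiveLocation and archiveBytes are placeholders, and the call is assumed to run in a context that handles Parser.Failure and InterruptedException:

    // Sketch only: parse a whole .7z archive held in memory.
    final sevenzipParser sevenzip = new sevenzipParser();
    final Document doc = sevenzip.parse(
            archiveLocation,               // DigestURL of the .7z resource
            "application/x-7z-compressed",
            null,                          // charset unknown
            TagValency.EVAL,               // default valency for extracted html entries
            new HashSet<String>(),         // no valency switches
            0,                             // timezone offset
            archiveBytes);                 // byte[] with the raw archive content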

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
// this is a new implementation of this parser idiom using multiple documents as result set
@ -70,7 +71,8 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
@ -104,17 +106,17 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
/*
* Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive.
* We use the tar file name as the parent sub path. Example : http://host/archive.tar/name.
* Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
* extension of the URL is still ".tar", thus incorrectly making the tar parser appear
* as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
} catch (final Parser.Failure e) {
@ -130,146 +132,146 @@ public class tarParser extends AbstractParser implements Parser {
return new Document[]{maindoc};
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
final DigestURL parentTarURL = createParentTarURL(location);
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside
TarArchiveEntry entry;
int totalProcessedLinks = 0;
while (true) {
try {
entry = tis.getNextTarEntry();
if (entry == null) {
break;
}
/*
* We are here sure at least one entry has still to be processed : let's check
* now the bytes limit as sub parsers applied on eventual previous entries may
* not support partial parsing and would have thrown a Parser.Failure instead of
* marking the document as partially parsed.
*/
if (tis.getBytesRead() >= maxBytes) {
maindoc.setPartiallyParsed(true);
break;
}
if (entry.isDirectory() || entry.getSize() <= 0) {
continue;
}
final String name = entry.getName();
final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
try {
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
/*
* Create an appropriate sub location to prevent unwanted fallback to the
* tarparser on resources included in the archive. We use the tar file name as
* the parent sub path. Example : http://host/archive.tar/name. Indeed if we
* create a sub location with a '#' separator such as
* http://host/archive.tar#name, the extension of the URL is still ".tar", thus
* incorrectly making the tar parser appear as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
/*
* If the parser(s) did not consume all bytes in the entry, these ones will be
* skipped by the next call to getNextTarEntry()
*/
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
for (Document subDoc : subDocs) {
if (subDoc.getAnchors() != null) {
totalProcessedLinks += subDoc.getAnchors().size();
}
}
/*
 * Check whether a limit has been exceeded (we always reach this point when
 * maxLinks has been exceeded, because that limit requires parser support for
 * partial parsing in order to be detected)
 */
if (subDocs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
break;
}
} catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
}
} catch (final IOException e) {
AbstractParser.log.warn("tar parser: " + e.getMessage());
break;
}
}
return new Document[] { maindoc };
}
/**
* Generate a parent URL to use for generating sub URLs on tar archive entries.
*
* @param tarURL
* the URL of the tar archive
* @return a URL ending with a "/", suitable as a base URL for archive entries
*/
private DigestURL createParentTarURL(final DigestURL tarURL) {
String locationStr = tarURL.toNormalform(false);
if (!locationStr.endsWith("/")) {
locationStr += "/";
}
DigestURL parentTarURL;
try {
parentTarURL = new DigestURL(locationStr);
} catch (MalformedURLException e1) {
/* This should not happen */
parentTarURL = tarURL;
}
return parentTarURL;
}
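/*
 * Illustrative example (not part of the original source): for the archive URL
 * "http://host/archive.tar" this method yields "http://host/archive.tar/", so an
 * entry named "docs/readme.html" resolves to
 * "http://host/archive.tar/docs/readme.html" and is dispatched by its ".html"
 * extension instead of falling back to the tar parser:
 *
 *   final DigestURL parent = createParentTarURL(new DigestURL("http://host/archive.tar"));
 *   final DigestURL sub = new DigestURL(parent, "docs/readme.html");
 */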
/**
* Create the main resulting parsed document for a tar container
*
* @param location
* the parsed resource URL
* @param mimeType
* the media type of the resource
* @param charset
* the charset name if known
* @param parser
* instance of tarParser that is registered as the parser origin of
* the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final tarParser parser) {
final String filename = location.getFileName();
final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
return maindoc;
}
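/*
 * Usage sketch (illustrative): parse() builds the container document first and
 * attaches the documents of the tar entries to it afterwards, e.g.
 *
 *   final Document maindoc = createMainDocument(location, mimeType, charset, this);
 *   // ... parse entries, then maindoc.addSubDocuments(subDocs);
 */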
public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false;

@ -39,6 +39,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -72,7 +73,8 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -121,7 +123,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) {

@ -626,6 +626,7 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
TagValency.EVAL,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
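For orientation, a minimal sketch of the shape of the new net.yacy.document.parser.html.TagValency type referenced above; only the EVAL constant is visible in this change set, so the IGNORE counterpart and the comments are assumptions rather than part of the diff.
// Sketch only - assumed shape, inferred from how TagValency is used in this change set.
package net.yacy.document.parser.html;

public enum TagValency {
    EVAL,   // tag content is evaluated; the default the crawl profiles pass in this commit
    IGNORE  // assumed counterpart: tag content is skipped by the scraper
}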

@ -43,6 +43,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
@ -161,7 +162,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null, null,
TagValency.EVAL, null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {

@ -709,7 +709,16 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
documents = TextParser.parseSource(
url,
responseHeader.getContentType(),
responseHeader.getCharacterEncoding(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
timezoneOffset,
response.depth(),
response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());
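For reference, a hedged caller sketch of the widened TextParser.parseSource signature, modeled on the DocumentIndex.java hunk further down; url, timezoneOffset, length and sourceStream are placeholders, and the reading of valencySwitchTagNames as the set of names that receive the opposite of the default valency is inferred from the parameter names, not stated in this diff.
// Illustrative only - mirrors the overload used in DocumentIndex.java below.
final Set<String> valencySwitchTagNames = new HashSet<String>();
valencySwitchTagNames.add("navigation"); // placeholder tag/class name
final Document[] docs = TextParser.parseSource(
        url,                    // DigestURL of the resource (placeholder)
        null,                   // MIME type unknown: let the parser dispatcher decide
        null,                   // charset unknown
        TagValency.EVAL,        // default valency applied to all tags
        valencySwitchTagNames,  // names treated with the opposite valency (inferred)
        new VocabularyScraper(),
        timezoneOffset,         // placeholder int
        0,                      // crawl depth
        length,                 // content length in bytes (placeholder)
        sourceStream);          // InputStream with the content (placeholder)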

@ -2945,7 +2945,8 @@ public final class Switchboard extends serverSwitch {
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
@ -2963,7 +2964,8 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration;
@ -162,24 +163,24 @@ public class DocumentIndex extends Segment {
}
InputStream sourceStream = null;
try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
documents = TextParser.parseSource(url, null, null, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally {
if(sourceStream != null) {
try {
sourceStream.close();
} catch(IOException e) {
ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage());
}
}
}
//Document document = Document.mergeDocuments(url, null, documents);
final SolrInputDocument[] rows = new SolrInputDocument[documents.length];
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true, 0);
rows[c++] =
super.storeDocument(
