crawl profile adoption to new tag valency attribute

pull/554/head
Michael Christen 2 years ago
parent 5acd98f4da
commit 4304e07e6f

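The hunks below replace the crawl profile's single ignore_class_name set with two pieces of state: a default TagValency plus a set of class names that switch that default. The TagValency enum itself is not part of this diff; the following is only a minimal sketch of the shape it presumably has (EVAL is the only constant referenced in the hunks, the IGNORE counterpart is an assumption):

package net.yacy.document.parser.html;

/** Sketch only: assumed shape of the enum referenced throughout this commit. */
public enum TagValency {
    EVAL,   // tag content is evaluated by the scraper (the default passed in every hunk below)
    IGNORE  // assumed counterpart: tag content is skipped; profiles can switch to this for listed class names
}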
@ -51,6 +51,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
@ -276,7 +277,6 @@ public final class CrawlSwitchboard {
return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey));
}
private void initActiveCrawlProfiles() {
final Switchboard sb = Switchboard.getSwitchboard();
@ -308,6 +308,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -341,6 +342,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -373,6 +375,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -405,6 +408,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -437,6 +441,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -469,6 +474,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -509,6 +515,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -541,6 +548,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -573,6 +581,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -605,6 +614,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(
@ -640,6 +650,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
TagValency.EVAL,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);

@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@ -355,7 +356,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
ClientIdentification.yacyInternetCrawlerAgentName,
TagValency.EVAL, null, null, 0);
return profile;
}

@ -55,6 +55,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
@ -126,7 +127,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
DEFAULT_VALENCY ("default_valency", false, CrawlAttribute.STRING, "default tag valency"),
VALENCY_SWITCH_TAG_NAME ("valency_switch_tag_name", false, CrawlAttribute.STRING, "DIV Class names when default valency shall be switched"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@ -150,7 +152,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
@ -175,7 +176,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final TagValency defaultValency;
private final Set<String> valencySwitchTagNames;
private final VocabularyScraper scraper;
/**
@ -238,7 +240,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@ -283,9 +286,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the ignore_class_name and scraper information into a JSON Array
this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
String jsonString = new JSONArray(ignore_class_name).toString();
put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames == null ? new HashSet<String>() : valencySwitchTagNames;
String jsonString = new JSONArray(valencySwitchTagNames).toString();
put(CrawlAttribute.DEFAULT_VALENCY.key, defaultValency.name());
put(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
@ -305,9 +310,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
String defaultValency = ext.get(CrawlAttribute.DEFAULT_VALENCY.key);
this.defaultValency = defaultValency == null || defaultValency.length() == 0 ? TagValency.EVAL : TagValency.valueOf(defaultValency);
String jsonString = ext.get(CrawlAttribute.VALENCY_SWITCH_TAG_NAME.key);
JSONArray a;
if(jsonString == null) {
if (jsonString == null) {
a = new JSONArray();
} else {
try {
@ -317,9 +324,9 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
a = new JSONArray();
}
}
this.ignore_class_name = new HashSet<String>();
this.valencySwitchTagNames = new HashSet<String>();
for (int i = 0; i < a.length(); i++) try {
this.ignore_class_name.add(a.getString(i));
this.valencySwitchTagNames.add(a.getString(i));
} catch (JSONException e) {}
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
if (jsonString == null || jsonString.length() == 0) {
@ -336,8 +343,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
public TagValency defaultValency() {
return this.defaultValency;
}
public Set<String> valencySwitchTagNames() {
return this.valencySwitchTagNames;
}
public VocabularyScraper scraper() {
@ -716,8 +727,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.indexMediaTypeMustNotMatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).

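For reference, the two new profile attributes round-trip through the underlying profile map exactly as the constructor hunks above show: the default valency is stored by enum name, the switch names as a JSON array, and a missing attribute falls back to EVAL. A condensed sketch of that logic outside the real CrawlProfile class (the helper name is hypothetical; the JSON calls mirror those visible in the diff):

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.json.JSONArray;
import org.json.JSONException;

import net.yacy.document.parser.html.TagValency;

final class ValencyAttributes {
    static final String DEFAULT_VALENCY = "default_valency";
    static final String VALENCY_SWITCH_TAG_NAME = "valency_switch_tag_name";

    /** store: enum name plus a JSON array of class names, as in the patched constructor */
    static void write(final Map<String, String> profile, final TagValency defaultValency, final Set<String> switchNames) {
        profile.put(DEFAULT_VALENCY, defaultValency.name());
        profile.put(VALENCY_SWITCH_TAG_NAME, new JSONArray(switchNames == null ? new HashSet<String>() : switchNames).toString());
    }

    /** read back, falling back to EVAL when the attribute is absent (older stored profiles) */
    static TagValency readDefaultValency(final Map<String, String> profile) {
        final String v = profile.get(DEFAULT_VALENCY);
        return (v == null || v.isEmpty()) ? TagValency.EVAL : TagValency.valueOf(v);
    }

    static Set<String> readSwitchNames(final Map<String, String> profile) {
        final Set<String> names = new HashSet<>();
        final String json = profile.get(VALENCY_SWITCH_TAG_NAME);
        if (json == null) return names;
        try {
            final JSONArray a = new JSONArray(json);
            for (int i = 0; i < a.length(); i++) names.add(a.getString(i));
        } catch (final JSONException e) {
            // malformed entries are ignored, mirroring the behaviour in the diff
        }
        return names;
    }
}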
@ -48,6 +48,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
public class Response {
@ -873,7 +874,11 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(
url(), this.responseHeader == null ? null : this.responseHeader.getContentType(),
this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(),
TagValency.EVAL, new HashSet<String>(),
new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {

@ -32,6 +32,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.TagValency;
public abstract class AbstractParser implements Parser {
@ -64,7 +65,7 @@ public abstract class AbstractParser implements Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
return parse(url, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
@ -72,7 +73,8 @@ public abstract class AbstractParser implements Parser {
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
@ -80,7 +82,6 @@ public abstract class AbstractParser implements Parser {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following abstract implementations create a circular call which would cause an endless loop when called.
* They are both here because one of them must be overridden by the implementing class.
@ -96,7 +97,7 @@ public abstract class AbstractParser implements Parser {
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
return parseWithLimits(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
@ -104,7 +105,8 @@ public abstract class AbstractParser implements Parser {
DigestURL location,
String mimeType,
String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
@ -171,5 +173,4 @@ public abstract class AbstractParser implements Parser {
return false;
}
}

@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.parser.html.TagValency;
public interface Parser {
@ -68,7 +69,8 @@ public interface Parser {
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
@ -113,15 +115,29 @@ public interface Parser {
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset,
public Document[] parseWithLimits(
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**

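Callers that have no per-profile valency configuration simply pass TagValency.EVAL together with an empty switch set, as the Response, MediawikiImporter and DocumentIndex hunks do. A usage sketch of the widened TextParser.parseSource call (mime type, charset and depth values are illustrative only):

import java.nio.charset.StandardCharsets;
import java.util.HashSet;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;

final class ParseWithDefaultValency {
    static Document[] parse(final DigestURL url, final byte[] content) throws Exception {
        return TextParser.parseSource(
                url,
                "text/html",                   // mime type (illustrative)
                StandardCharsets.UTF_8.name(), // charset
                TagValency.EVAL,               // default valency: evaluate all tags
                new HashSet<String>(),         // no class names switch the default
                new VocabularyScraper(),       // no vocabulary scraping
                0,                             // timezoneOffset
                1,                             // crawl depth (illustrative)
                content);
    }
}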
@ -51,6 +51,7 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser;
@ -184,7 +185,8 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -201,7 +203,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -218,7 +220,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -236,7 +239,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@ -248,7 +251,8 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -261,14 +265,15 @@ public final class TextParser {
final Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -330,7 +335,7 @@ public final class TextParser {
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
return parseSource(location, mimeType, parser, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (final Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@ -382,7 +387,7 @@ public final class TextParser {
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
maxBytesToRead = (int)maxBytes + 1;
}
if(contentLength >= 0 && contentLength < maxBytesToRead) {
if (contentLength >= 0 && contentLength < maxBytesToRead) {
maxBytesToRead = (int)contentLength;
}
@ -392,16 +397,23 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
final Document[] docs = parseSource(location, mimeType, idioms, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
public static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
@ -424,10 +436,19 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
public static Document[] parseWithLimits(
final DigestURL location,
String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream,
int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -449,10 +470,11 @@ public final class TextParser {
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
public static Document[] parseWithLimits(
final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -475,7 +497,8 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream,
@ -491,11 +514,11 @@ public final class TextParser {
try {
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser do not support partial parsing within limits : let's control it here*/
final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(final Parser.Failure e) {
@ -524,7 +547,8 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -552,13 +576,13 @@ public final class TextParser {
}
try {
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
/* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
docs = parser.parse(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

@ -68,6 +68,7 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -536,7 +537,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);

@ -37,6 +37,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
/**
* Base class for parsing compressed files relying on Apache commons-compress
@ -73,8 +74,14 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
protected abstract String getUncompressedFilename(final String filename);
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
public Document[] parse(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
@ -82,9 +89,17 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
@ -97,7 +112,7 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
final Document[] docs = this.parseCompressedInputStream(location, null, defaultValency, valencySwitchTagNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
@ -151,9 +166,15 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
protected Document[] parseCompressedInputStream(
final DigestURL location,
final String charset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream,
final int maxLinks,
final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
@ -172,7 +193,8 @@ public abstract class AbstractCompressorParser extends AbstractParser implements
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
return TextParser.parseWithLimits(
contentLocation, mime, charset, defaultValency, valencySwitchTagNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -70,7 +71,8 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -125,7 +127,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile);
final Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -210,7 +212,6 @@ public class bzipParser extends AbstractParser implements Parser {
}
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
/**
@ -72,7 +73,8 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
@ -128,7 +130,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -108,7 +108,18 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location,
mimeType,
documentCharset,
TagValency.EVAL,
new HashSet<String>(),
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
@ -116,12 +127,23 @@ public class htmlParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(
location, mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
Integer.MAX_VALUE,
DEFAULT_MAX_LINKS,
Long.MAX_VALUE);
}
@Override
@ -130,20 +152,49 @@ public class htmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxLinks,
final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
return parseWithLimits(
location,
mimeType,
documentCharset,
defaultValency,
valencySwitchTagNames,
vocscraper,
timezoneOffset,
sourceStream,
maxLinks,
maxLinks,
maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
private Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream,
final int maxAnchors,
final int maxLinks,
final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -152,10 +203,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -221,7 +272,16 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final int timezoneOffset,
final String input,
final int maxAnchors,
final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -231,7 +291,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static methode no need to init local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -256,7 +316,8 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
@ -280,8 +341,8 @@ public class htmlParser extends AbstractParser implements Parser {
htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
valencySwitchTagNames,
defaultValency,
vocabularyScraper,
location,
false,
@ -325,7 +386,7 @@ public class htmlParser extends AbstractParser implements Parser {
location,
maxAnchors,
maxLinks,
ignore_class_name,
valencySwitchTagNames,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
@ -457,7 +518,8 @@ public class htmlParser extends AbstractParser implements Parser {
*/
private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final TagValency defaultValency, final Set<String> valencySwitchTagNames,
final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
@ -477,7 +539,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {

@ -44,6 +44,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
@ -63,7 +64,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
@ -94,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset);
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -116,10 +118,11 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source));
return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
@ -127,14 +130,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -148,7 +152,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private Set<String> ignore_class_name;
private final TagValency defaultValency;
private Set<String> valencySwitchTagNames;
private final int timezoneOffset;
public SZParserExtractCallback(
@ -156,13 +161,15 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler,
final Document doc,
final String prefix,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.ignore_class_name = ignore_class_name;
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames;
this.timezoneOffset = timezoneOffset;
}
@ -205,7 +212,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
// this is a new implementation of this parser idiom using multiple documents as result set
@ -70,7 +71,8 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
@ -104,7 +106,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
/*
* Create an appropriate sub location to prevent unwanted fallback to the tarparser on resources included in the archive.
* We use the tar file name as the parent sub path. Example : http://host/archive.tar/name.
* Indeed if we create a sub location with a '#' separator such as http://host/archive.tar#name, the
@ -112,7 +114,7 @@ public class tarParser extends AbstractParser implements Parser {
* as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}
@ -130,57 +132,57 @@ public class tarParser extends AbstractParser implements Parser {
return new Document[]{maindoc};
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final InputStream source, final int maxLinks,
final long maxBytes) throws Failure, InterruptedException, UnsupportedOperationException {
final DigestURL parentTarURL = createParentTarURL(location);
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
// create maindoc for this tar container
final Document maindoc = createMainDocument(location, mimeType, charset, this);
// loop through the elements in the tar file and parse every single file inside
TarArchiveEntry entry;
int totalProcessedLinks = 0;
while (true) {
try {
entry = tis.getNextTarEntry();
if (entry == null) {
break;
}
/*
* We are here sure at least one entry has still to be processed : let's check
* now the bytes limit as sub parsers applied on eventual previous entries may
* not support partial parsing and would have thrown a Parser.Failure instead of
* marking the document as partially parsed.
*/
if (tis.getBytesRead() >= maxBytes) {
maindoc.setPartiallyParsed(true);
break;
}
if (entry.isDirectory() || entry.getSize() <= 0) {
continue;
}
final String name = entry.getName();
final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx > -1) ? name.substring(idx + 1) : "");
try {
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
/*
* Create an appropriate sub location to prevent unwanted fallback to the
* tarparser on resources included in the archive. We use the tar file name as
* the parent sub path. Example : http://host/archive.tar/name. Indeed if we
@ -188,66 +190,66 @@ public class tarParser extends AbstractParser implements Parser {
* http://host/archive.tar#name, the extension of the URL is still ".tar", thus
* incorrectly making the tar parser as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseWithLimits(subLocation, mime, null, timezoneOffset, 999,
entry.getSize(), tis, maxLinks - totalProcessedLinks, maxBytes - tis.getBytesRead());
/*
* If the parser(s) did not consume all bytes in the entry, these ones will be
* skipped by the next call to getNextTarEntry()
*/
if (subDocs == null) {
continue;
}
maindoc.addSubDocuments(subDocs);
for (Document subDoc : subDocs) {
if (subDoc.getAnchors() != null) {
totalProcessedLinks += subDoc.getAnchors().size();
}
}
/*
* Check if a limit has been exceeded (we are sure to pass here when maxLinks
* has been exceeded as this limit require parser support for partial parsing to
* be detected)
*/
if (subDocs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
break;
}
} catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
}
} catch (final IOException e) {
AbstractParser.log.warn("tar parser:" + e.getMessage());
break;
}
}
return new Document[] { maindoc };
}
/**
* Generate a parent URL to use for generating sub URLs on tar archive entries.
*
* @param tarURL
* the URL of the tar archive
* @return an URL ending with a "/" suitable as a base URL for archive entries
*/
private DigestURL createParentTarURL(final DigestURL tarURL) {
String locationStr = tarURL.toNormalform(false);
if (!locationStr.endsWith("/")) {
locationStr += "/";
}
DigestURL parentTarURL;
try {
parentTarURL = new DigestURL(locationStr);
} catch (MalformedURLException e1) {
/* This should not happen */
parentTarURL = tarURL;
}
return parentTarURL;
}
/**
* Create the main resulting parsed document for a tar container
*
* @param location
@ -261,15 +263,15 @@ public class tarParser extends AbstractParser implements Parser {
* the document
* @return a Document instance
*/
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final tarParser parser) {
final String filename = location.getFileName();
final Document maindoc = new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
return maindoc;
}
public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false;

@ -39,6 +39,7 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -72,7 +73,8 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -121,7 +123,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, defaultValency, valencySwitchTagNames, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) {

@ -626,6 +626,7 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
TagValency.EVAL,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);

@ -43,6 +43,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
@ -161,7 +162,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null, null,
TagValency.EVAL, null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {

@ -709,7 +709,16 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
documents = TextParser.parseSource(
url,
responseHeader.getContentType(),
responseHeader.getCharacterEncoding(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
timezoneOffset,
response.depth(),
response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -2945,7 +2945,8 @@ public final class Switchboard extends serverSwitch {
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
@ -2963,7 +2964,8 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().defaultValency(),
response.profile().valencySwitchTagNames(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),

@ -45,6 +45,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration;
@ -163,7 +164,7 @@ public class DocumentIndex extends Segment {
InputStream sourceStream = null;
try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
documents = TextParser.parseSource(url, null, null, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally {
