added a crawl filter based on <div> tag class names

When a crawl is started, a new field is available to exclude content from
scraping. The content to exclude is identified by the class names of div
tags: all text contained in a div tag whose class name matches one of the
configured class name(s) is not indexed, while the rest of the page is
indexed.
pull/149/head
Michael Peter Christen 7 years ago
parent 607b39b427
commit 25573bd5ab
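As an illustration of the new behavior, here is a minimal sketch (not part of the commit; the URL, class name, and sample markup are invented) that uses the parseToScraper helper as extended below:

import java.util.HashSet;
import java.util.Set;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class IgnoreClassNameSketch {
    public static void main(String[] args) throws Exception {
        // class names to filter; "sidebar" is an invented example value
        Set<String> ignore = new HashSet<>();
        ignore.add("sidebar");
        String html = "<html><body>"
                + "<div class=\"sidebar\">navigation junk</div>"
                + "<p>real content</p>"
                + "</body></html>";
        // signature of parseToScraper as extended in this commit
        ContentScraper scraper = htmlParser.parseToScraper(
                new DigestURL("http://localhost/"), "UTF-8", ignore,
                new VocabularyScraper(), 0, html, 10, 10);
        // expected: "real content" is kept, "navigation junk" is dropped
        System.out.println(scraper.getText());
    }
}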

@ -366,6 +366,18 @@
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page has been loaded.</p>
<dl>
<dt>Filter div class names</dt>
<dd>
<table border="0">
<tr><td width="110">set of class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of div class names which should be filtered out</td></tr>
</table>
</dd>
</dl>
</fieldset>
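For example (invented values), entering "sidebar, footer-nav" in the field above causes all text inside div tags whose class attribute is exactly "sidebar" or exactly "footer-nav" to be excluded from indexing.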
<fieldset>
<legend>Clean-Up before Crawl Start</legend>
<dl>

@ -513,6 +513,14 @@ public class CrawlStartExpert {
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name
if (post != null && post.containsKey("ignoreclassname")) {
prop.put("ignoreclassname",
post.get("ignoreclassname", ""));
} else {
prop.put("ignoreclassname", "");
}
// ---------- Enrich Vocabulary
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
if (vocs.size() == 0) {

@ -468,6 +468,15 @@ public class Crawler_p {
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
String ignoreclassname_s = post.get("ignoreclassname");
Set<String> ignoreclassname = new HashSet<>();
if (ignoreclassname_s != null) {
String[] ignoreclassname_a = ignoreclassname_s.trim().split(",");
for (int i = 0; i < ignoreclassname_a.length; i++) {
ignoreclassname.add(ignoreclassname_a[i].trim());
}
}
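For illustration (invented input), the split above turns a field value like "sidebar, footer-nav ,ads" into the set {"sidebar", "footer-nav", "ads"}, trimming whitespace per entry. An empty field yields a set containing only the empty string, which is harmless because the scraper only filters non-empty class attributes.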
// get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
for (String key: post.keySet()) {
@ -552,6 +561,7 @@ public class Crawler_p {
cachePolicy,
collection,
agentName,
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
handle = ASCII.getBytes(profile.handle());
@ -646,7 +656,7 @@ public class Crawler_p {
/* No restriction on domains or subpath : we now scrape links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new VocabularyScraper(), profile.timezoneOffset());
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
@ -784,7 +794,7 @@ public class Crawler_p {
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report a detailed error here to help the user when a wrong file was selected */

@ -159,7 +159,7 @@ public class QuickCrawlLink_p {
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
timezoneOffset);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {

@ -297,7 +297,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
@ -330,7 +330,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
@ -362,7 +362,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
@ -394,7 +394,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
@ -426,7 +426,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
@ -458,7 +458,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
@ -491,7 +491,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
@ -523,7 +523,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
@ -555,7 +555,7 @@ public final class CrawlSwitchboard {
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
@ -587,7 +587,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
@ -622,7 +622,7 @@ public final class CrawlSwitchboard {
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
this.defaultPushProfiles.put(collection, genericPushProfile);

@ -28,10 +28,12 @@ package net.yacy.crawler.data;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
@ -44,6 +46,8 @@ import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.JSONArray;
import net.yacy.cora.util.JSONTokener;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
@ -96,6 +100,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEX_TEXT ("indexText", false, CrawlAttribute.BOOLEAN, "Index Text"),
INDEX_MEDIA ("indexMedia", false, CrawlAttribute.BOOLEAN, "Index Media"),
COLLECTIONS ("collections", false, CrawlAttribute.STRING, "Collections (comma-separated list)"),
IGNORE_DIV_CLASS_NAME ("ignore_class_name", false, CrawlAttribute.STRING, "Ignore DIV Class names"),
SCRAPER ("scraper", false, CrawlAttribute.STRING, "Declaration for Vocabulary Scraper"),
TIMEZONEOFFSET ("timezoneOffset", true, CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
@ -128,6 +133,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final Set<String> ignore_class_name;
private final VocabularyScraper scraper;
/**
@ -190,6 +196,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset) {
super(40);
@ -230,9 +237,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the scraper information into a JSON Array
// we transform the ignore_class_name and scraper information into a JSON Array
this.ignore_class_name = ignore_class_name == null ? new HashSet<String>() : ignore_class_name;
String jsonString = new JSONArray(ignore_class_name).toString();
put(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key, jsonString);
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
String jsonString = this.scraper.toString();
jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(CrawlAttribute.SCRAPER.key, jsonString);
put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
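With invented values, the profile map then carries an entry like ignore_class_name = ["sidebar","ads"]; the second constructor below reads this JSON array back into a HashSet.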
@ -246,10 +256,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
String jsonString = ext.get(CrawlAttribute.SCRAPER.key);
String jsonString = ext.get(CrawlAttribute.IGNORE_DIV_CLASS_NAME.key);
JSONArray a = jsonString == null ? new JSONArray() : new JSONArray(new JSONTokener(jsonString));
this.ignore_class_name = new HashSet<String>();
for (int i = 0; i < a.length(); i++) this.ignore_class_name.add(a.getString(i));
jsonString = ext.get(CrawlAttribute.SCRAPER.key);
this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
}
public Set<String> ignoreDivClassName() {
return this.ignore_class_name;
}
public VocabularyScraper scraper() {
return this.scraper;
}
@ -798,4 +816,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
}
public static void main(String[] args) {
// test to convert the key set from set to string and back
Set<String> a = new HashSet<>();
a.add("eins"); a.add("zwei"); a.add("drei");
JSONArray j = new JSONArray(a);
String s = j.toString();
System.out.println(s);
JSONTokener o = new JSONTokener(s);
j = new JSONArray(o);
System.out.println(j);
Set<String> h = new HashSet<String>();
for (int i = 0; i < j.length(); i++) h.add(j.getString(i));
System.out.println(h);
}
}
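(Run standalone, the main() above prints the JSON array, the re-parsed array, and the reconstructed set; since HashSet has no defined iteration order, the element order may vary between runs.)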

@ -28,6 +28,7 @@ package net.yacy.crawler.retrieval;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import net.yacy.cora.document.analysis.Classification;
@ -861,7 +862,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.getContentType());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.getContentType(), this.responseHeader == null ? StandardCharsets.UTF_8.name() : this.responseHeader.getCharacterEncoding(), new HashSet<String>(), new VocabularyScraper(), this.request.timezoneOffset(), this.request.depth(), this.content);
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {

@ -135,7 +135,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper(), 0);
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);

@ -190,7 +190,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
null,
null, null,
0); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(

@ -50,6 +50,70 @@ public abstract class AbstractParser implements Parser {
this.name = name;
}
/*
* The following default implementations call each other; if neither is overridden they would recurse endlessly.
* Both are provided so that an implementing class only needs to override one of them.
*/
@Override
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source);
}
@Override
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException {
return parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
/*
* The following default implementations call each other; if neither is overridden they would recurse endlessly.
* Both are provided so that an implementing class only needs to override one of them.
*/
@Override
public Document[] parseWithLimits(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source,
final int maxLinks,
final long maxBytes) throws UnsupportedOperationException, Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, new HashSet<String>(), scraper, timezoneOffset, source, maxLinks, maxBytes);
}
@Override
public Document[] parseWithLimits(
DigestURL location,
String mimeType,
String charset,
final Set<String> ignore_class_name,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source,
int maxLinks,
long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes);
}
/**
* return the name of the parser
*/
@ -101,14 +165,6 @@ public abstract class AbstractParser implements Parser {
return c;
}
@Override
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Please override on subclasses when implementation is possible */
throw new UnsupportedOperationException();
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
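To make the override requirement concrete, a hypothetical subclass (not part of the commit; name and format invented) could break the circular defaults by overriding only the Set<String>-aware parse variant:

import java.io.InputStream;
import java.util.Set;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;

// sketch: the legacy parse(...) without the ignore_class_name set is
// routed to this override by AbstractParser's default implementation
public class MyFormatParser extends AbstractParser {

    public MyFormatParser() {
        super("my format parser"); // invented parser name
    }

    @Override
    public Document[] parse(DigestURL url, String mimeType, String charset,
            Set<String> ignoreClassName, VocabularyScraper scraper,
            int timezoneOffset, InputStream source)
            throws Parser.Failure, InterruptedException {
        // ... read the source stream and build documents here ...
        return new Document[0]; // placeholder
    }
}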

@ -28,6 +28,7 @@ import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.Parser.Failure;
public interface Parser {
@ -64,6 +65,16 @@ public interface Parser {
InputStream source
) throws Parser.Failure, InterruptedException;
public Document[] parse(
DigestURL url,
String mimeType,
String charset,
Set<String> ignore_class_name,
VocabularyScraper scraper,
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, terminating processing once a total of
* maxLinks URLs (anchors, image links, media links...) has been reached,
@ -103,10 +114,17 @@ public interface Parser {
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset,
VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.

@ -182,6 +182,7 @@ public final class TextParser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -198,7 +199,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -215,6 +216,7 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -232,7 +234,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
@ -241,6 +243,7 @@ public final class TextParser {
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -302,7 +305,7 @@ public final class TextParser {
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset,
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
@ -364,15 +367,16 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, contentLength, sourceStream,
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
@ -397,7 +401,7 @@ public final class TextParser {
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@ -420,6 +424,7 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream,
@ -435,11 +440,11 @@ public final class TextParser {
try {
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser does not support partial parsing within limits : let's control it here */
InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(Parser.Failure e) {
@ -468,6 +473,7 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
@ -495,13 +501,13 @@ public final class TextParser {
}
try {
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, bis, maxLinks, maxBytes);
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
/* Partial parsing is not supported by this parser : check content length now */
if(sourceArray.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&

@ -40,6 +40,7 @@ import java.io.UnsupportedEncodingException;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
@ -534,7 +535,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
@ -69,6 +70,7 @@ public class bzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -123,7 +125,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
final Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
@ -71,6 +72,7 @@ public class gzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
@ -126,7 +128,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, ignore_class_name, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -209,6 +209,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final int maxAnchors;
private final VocabularyScraper vocabularyScraper;
private final Set<String> ignore_class_name;
private final int timezoneOffset;
private int breadcrumbs;
@ -241,13 +242,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param timezoneOffset local time zone offset
*/
@SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.vocabularyScraper = vocabularyScraper;
this.ignore_class_name = ignore_class_name;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -294,8 +296,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
public ContentScraper(final DigestURL root, final int maxLinks, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, vocabularyScraper, timezoneOffset);
public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
}
@Override
@ -835,11 +837,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
final String h;
if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
final String classn = tag.opts.getProperty("class", EMPTY_STRING);
if (classn.length() > 0 && this.ignore_class_name.contains(classn)) {
// we remove everything inside that tag, so it can be ignored
tag.content.clear();
} else {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
}
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
@ -1477,13 +1485,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper(), timezoneOffset);
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();
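A note on the matching rule in the div handling added above: classn is the full value of the class attribute and the test is a literal Set.contains, so multi-class attributes must be configured verbatim. A quick sketch (invented values):

Set<String> ignore = new HashSet<>();
ignore.add("sidebar");
ignore.contains("sidebar");      // true  -> <div class="sidebar"> is filtered
ignore.contains("sidebar wide"); // false -> <div class="sidebar wide"> is kept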

@ -35,6 +35,7 @@ import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
@ -61,6 +62,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final Set<String> ignore_class_name,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl,
final Transformer transformer,
@ -72,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper, timezoneOffset);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this);
try {

@ -36,8 +36,10 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.io.IOUtils;
@ -105,7 +107,20 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
return parseWithLimits(location, mimeType, documentCharset, new HashSet<String>(), vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
}
@Override
public Document[] parse(
final DigestURL location,
final String mimeType,
final String documentCharset,
final Set<String> ignore_class_name,
final VocabularyScraper vocscraper,
final int timezoneOffset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, Integer.MAX_VALUE, DEFAULT_MAX_LINKS, Long.MAX_VALUE);
}
@Override
@ -114,19 +129,20 @@ public class htmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxLinks, final long maxBytes)
throws Failure {
return parseWithLimits(location, mimeType, documentCharset, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
return parseWithLimits(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, sourceStream, maxLinks, maxLinks, maxBytes);
}
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final VocabularyScraper vocscraper,
private Document[] parseWithLimits(final DigestURL location, final String mimeType, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final InputStream sourceStream, final int maxAnchors, final int maxLinks, final long maxBytes)
throws Failure {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraper = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -135,10 +151,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes the anchor part from the original url, getRef() is never successful - consider other handling such as removeRef() in crawler
if (location.getRef() != null && location.getRef().startsWith("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, ignore_class_name, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
}
}
} catch (Exception ex1) { // ignore any exception for any issue with snapshot
@ -203,7 +219,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, final int timezoneOffset, final String input, final int maxAnchors, final int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -213,7 +229,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper; // for this static method no need to init local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
scraper = parseToScraper(location, documentCharset, ignore_class_name, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxAnchors, maxLinks, Long.MAX_VALUE);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -238,6 +254,7 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final Set<String> ignore_class_name,
final VocabularyScraper vocabularyScraper,
final Charset[] detectedcharsetcontainer,
final int timezoneOffset,
@ -258,7 +275,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, null, false, maxLinks, timezoneOffset);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -293,7 +310,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, vocabularyScraper, timezoneOffset);
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
@ -420,8 +437,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @param maxBytes the maximum number of content bytes to process
* @return the document resulting from the parsed snapshot, or null if no snapshot exists or on any other issue with the snapshot
*/
private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
final VocabularyScraper vocscraper, final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
private Document parseAlternativeSnapshot(
final DigestURL location, final String mimeType, final String documentCharset,
final Set<String> ignore_class_name, final VocabularyScraper vocscraper,
final int timezoneOffset, final int maxAnchors, final int maxLinks, final long maxBytes) {
Document documentSnapshot = null;
try {
// construct url for case (1) with anchor
@ -440,7 +459,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null;
try {
snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, ignore_class_name, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
} finally {
if(snapshotStream != null) {

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
@ -62,6 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final int timezoneOffset,
final IInStream source) throws Parser.Failure, InterruptedException {
@ -92,7 +94,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
}
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), timezoneOffset);
final SZParserExtractCallback aec = new SZParserExtractCallback(AbstractParser.log, archive, doc, location.getFile(), ignore_class_name, timezoneOffset);
AbstractParser.log.fine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -114,9 +116,10 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final int timezoneOffset,
final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, timezoneOffset, new ByteArrayIInStream(source));
return parse(location, mimeType, charset, ignore_class_name, timezoneOffset, new ByteArrayIInStream(source));
}
@Override
@ -124,13 +127,14 @@ public class sevenzipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
return new Document[]{parse(location, mimeType, charset, timezoneOffset, cfos.toByteArray())};
return new Document[]{parse(location, mimeType, charset, ignore_class_name, timezoneOffset, cfos.toByteArray())};
} catch (final IOException e) {
throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
}
@ -144,6 +148,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private Set<String> ignore_class_name;
private final int timezoneOffset;
public SZParserExtractCallback(
@ -151,11 +156,13 @@ public class sevenzipParser extends AbstractParser implements Parser {
final IInArchive handler,
final Document doc,
final String prefix,
final Set<String> ignore_class_name,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.ignore_class_name = ignore_class_name;
this.timezoneOffset = timezoneOffset;
}
@ -198,7 +205,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, this.ignore_class_name, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

@ -31,6 +31,7 @@ import java.io.InputStream;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
@ -69,6 +70,7 @@ public class tarParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException {
@ -110,7 +112,7 @@ public class tarParser extends AbstractParser implements Parser {
* as a possible parser for the sub resource.
*/
final DigestURL subLocation = new DigestURL(parentTarURL, name);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, scraper, timezoneOffset, 999, tmp);
final Document[] subDocs = TextParser.parseSource(subLocation, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) {
continue;
}

@ -28,6 +28,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -69,6 +70,7 @@ public class zipParser extends AbstractParser implements Parser {
final DigestURL location,
final String mimeType,
final String charset,
final Set<String> ignore_class_name,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
@ -117,7 +119,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, scraper, timezoneOffset, 999, tmp);
final Document[] docs = TextParser.parseSource(virtualURL, mime, null, ignore_class_name, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue;
maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) {

@ -688,7 +688,12 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<AnchorURL, String> loadLinks(final DigestURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent, final int timezoneOffset) throws IOException {
public final Map<AnchorURL, String> loadLinks(
final DigestURL url,
final CacheStrategy cacheStrategy,
BlacklistType blacklistType,
final ClientIdentification.Agent agent,
final int timezoneOffset) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
@ -699,7 +704,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.getContentType());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
documents = TextParser.parseSource(url, responseHeader.getContentType(), responseHeader.getCharacterEncoding(), response.profile().ignoreDivClassName(), response.profile().scraper(), timezoneOffset, response.depth(), response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -2951,6 +2951,7 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),

@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
@ -162,7 +163,7 @@ public class DocumentIndex extends Segment {
InputStream sourceStream = null;
try {
sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
documents = TextParser.parseSource(url, null, null, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, 0, length, sourceStream);
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
} finally {

@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -145,7 +146,7 @@ public class ContentScraperTest {
+ "<time datetime='2016-12-23'>23. Dezember 2016</time>" // html5 time tag
+ "</body></html>";
ContentScraper scraper = new ContentScraper(root, 10, new VocabularyScraper(), 0);
ContentScraper scraper = new ContentScraper(root, 10, new HashSet<String>(), new VocabularyScraper(), 0);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new StringReader(page), writer);

@ -10,6 +10,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
@ -265,7 +266,7 @@ public class htmlParserTest extends TestCase {
+ "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testhtml, 10, 10);
List<AnchorURL> anchorlist = scraper.getAnchors();
String linktxt = anchorlist.get(0).getTextProperty();
@ -307,7 +308,7 @@ public class htmlParserTest extends TestCase {
}
testHtml.append("</p></body></html>");
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testHtml.toString(), Integer.MAX_VALUE, Integer.MAX_VALUE);
assertEquals(nestingDepth, scraper.getAnchors().size());
assertEquals(1, scraper.getImages().size());
@ -328,7 +329,7 @@ public class htmlParserTest extends TestCase {
+ "<p>" + textSource + "</p>"
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testhtml, 10, 10);
String txt = scraper.getText();
System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
@ -357,7 +358,7 @@ public class htmlParserTest extends TestCase {
+ "</head>\n"
+ "<body>" + textSource + "</body>\n"
+ "</html>";
ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10, 10);
ContentScraper scraper = parseToScraper(url, charset, new HashSet<String>(), new VocabularyScraper(), 0, testhtml, 10, 10);
String txt = scraper.getText();
System.out.println("ScraperScriptTagTest: [" + textSource + "] = [" + txt + "]");
