added nav filter

pull/167/head
Michael Peter Christen 7 years ago
parent bcbd0ae1a4
commit 187075b878

@ -24,12 +24,10 @@
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.AbstractMap;
import java.util.Collection; import java.util.Collection;
import java.util.ConcurrentModificationException; import java.util.ConcurrentModificationException;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
@ -39,9 +37,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;

@ -370,10 +370,10 @@
<legend>Content Filter</legend> <legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.</p> <p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.</p>
<dl> <dl>
<dt>Filter div class names</dt> <dt>Filter div or nav class names</dt>
<dd> <dd>
<table style="border-width: 0px"> <table style="border-width: 0px">
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; element class names which should be filtered out</td></tr> <tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out</td></tr>
</table> </table>
</dd> </dd>
</dl> </dl>

@ -152,6 +152,7 @@ public class Scanner {
if (access != Access.unknown) Scanner.this.services.put(this, access); if (access != Access.unknown) Scanner.this.services.put(this, access);
} }
} catch (final OutOfMemoryError e) { } catch (final OutOfMemoryError e) {
e.printStackTrace();
} }
} }
public long age() { public long age() {

@ -129,6 +129,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script(TagType.pair), script(TagType.pair),
span(TagType.pair), span(TagType.pair),
div(TagType.pair), div(TagType.pair),
nav(TagType.pair),
article(TagType.pair), // html5 article(TagType.pair), // html5
time(TagType.pair), // html5 <time datetime> time(TagType.pair), // html5 <time datetime>
// tags used to capture tag content // tags used to capture tag content
@ -1020,7 +1021,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
/* Parent is not marked as ignored : let's check the current tag */ /* Parent is not marked as ignored : let's check the current tag */
if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) { if (!ignore &&
this.ignoreDivClassNames != null &&
tag != null &&
(TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
final String classAttr = tag.opts.getProperty("class", EMPTY_STRING); final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr); final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
ignore = !Collections.disjoint(this.ignoreDivClassNames, classes); ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);

Loading…
Cancel
Save