introduction of tag-to-indexing relation TagValency

pull/554/head
Michael Peter Christen 2 years ago
parent 95e02e5291
commit 5acd98f4da

@ -60,6 +60,7 @@ import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag; import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -135,7 +136,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try { try {
//load the links //load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0); final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), 0);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, false); final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(input,writer); FileUtils.copy(input,writer);

@ -146,22 +146,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String name; public String name;
public Properties opts; public Properties opts;
public CharBuffer content; public CharBuffer content;
private TagValency tv;
/** Set to true when this tag should be ignored from scraping */ public Tag(final String name, TagValency defaultValency) {
private boolean ignore = false;
public Tag(final String name) {
this.name = name; this.name = name;
this.tv = defaultValency;
this.opts = new Properties(); this.opts = new Properties();
this.content = new CharBuffer(MAX_TAGSIZE); this.content = new CharBuffer(MAX_TAGSIZE);
} }
public Tag(final String name, final Properties opts) { public Tag(final String name, TagValency defaultValency, final Properties opts) {
this.name = name; this.name = name;
this.tv = defaultValency;
this.opts = opts; this.opts = opts;
this.content = new CharBuffer(MAX_TAGSIZE); this.content = new CharBuffer(MAX_TAGSIZE);
} }
public Tag(final String name, final Properties opts, final CharBuffer content) { public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
this.name = name; this.name = name;
this.tv = defaultValency;
this.opts = opts; this.opts = opts;
this.content = content; this.content = content;
} }
@ -178,14 +178,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/** @return true when this tag should be ignored from scraping */ /** @return true when this tag should be ignored from scraping */
public boolean isIgnore() { public boolean isIgnore() {
return this.ignore; return this.tv == TagValency.IGNORE;
} }
public TagValency getValency() {
/** return this.tv;
* @param ignore true when this tag should be ignored from scraping }
*/ public void setValency(final TagValency tv) {
public void setIgnore(final boolean ignore) { this.tv = tv;
this.ignore = ignore;
} }
} }
@ -230,8 +229,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final VocabularyScraper vocabularyScraper; private final VocabularyScraper vocabularyScraper;
/** Set of CSS class names whose matching div elements content should be ignored */ /** Set of CSS class names whose matching div elements may switch from IGNORE to EVAL or vice versa */
private final Set<String> ignoreDivClassNames; private final Set<String> valencySwitchTagNames;
private final TagValency defaultValency;
private final int timezoneOffset; private final int timezoneOffset;
private int breadcrumbs; private int breadcrumbs;
@ -261,19 +261,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param root the document root url * @param root the document root url
* @param maxAnchors the maximum number of URLs to process and store in the anchors property. * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
* @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored * @param valencySwitchTagNames an eventual set of CSS class names whose matching div elements content should be ignored
* @param defaultValency the valency default; should be TagValency.EVAL by default
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset * @param timezoneOffset local time zone offset
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) { public ContentScraper(
final DigestURL root,
final int maxAnchors,
final int maxLinks,
final Set<String> valencySwitchTagNames,
final TagValency defaultValency,
final VocabularyScraper vocabularyScraper,
int timezoneOffset) {
// the root value here will not be used to load the resource. // the root value here will not be used to load the resource.
// it is only the reference for relative links // it is only the reference for relative links
super(linkTags0, linkTags1); super(linkTags0, linkTags1);
assert root != null; assert root != null;
this.root = root; this.root = root;
this.vocabularyScraper = vocabularyScraper; this.vocabularyScraper = vocabularyScraper;
this.ignoreDivClassNames = ignoreDivClassNames; this.valencySwitchTagNames = valencySwitchTagNames;
this.defaultValency = defaultValency;
this.timezoneOffset = timezoneOffset; this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation(); this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks); this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -321,8 +330,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset * @param timezoneOffset local time zone offset
*/ */
public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) { public ContentScraper(
this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset); final DigestURL root,
final int maxLinks,
final Set<String> valencySwitchTagNames,
final TagValency defaultValency,
final VocabularyScraper vocabularyScraper,
int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
}
public TagValency defaultValency() {
return this.defaultValency;
} }
@Override @Override
@ -333,7 +352,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override @Override
public void scrapeText(final char[] newtext0, final Tag insideTag) { public void scrapeText(final char[] newtext0, final Tag insideTag) {
if (insideTag != null) { if (insideTag != null) {
if(insideTag.ignore) { if (insideTag.tv == TagValency.IGNORE) {
return; return;
} }
if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) { if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
@ -720,7 +739,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/ */
@Override @Override
public void scrapeTag0(final Tag tag) { public void scrapeTag0(final Tag tag) {
if(tag.ignore) { if (tag.tv == TagValency.IGNORE) {
return; return;
} }
checkOpts(tag); checkOpts(tag);
@ -893,7 +912,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/ */
@Override @Override
public void scrapeTag1(final Tag tag) { public void scrapeTag1(final Tag tag) {
if(tag.ignore) { if (tag.tv == TagValency.IGNORE) {
return; return;
} }
checkOpts(tag); checkOpts(tag);
@ -1003,7 +1022,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/ */
@Override @Override
public void scrapeAnyTagOpening(final Tag tag) { public void scrapeAnyTagOpening(final Tag tag) {
if (tag != null && !tag.ignore && tag.opts != null) { if (tag != null && tag.tv == TagValency.EVAL && tag.opts != null) {
/* /*
* HTML microdata can be annotated on any kind of tag, so we don't restrict this * HTML microdata can be annotated on any kind of tag, so we don't restrict this
* scraping to the limited sets in linkTags0 and linkTags1 * scraping to the limited sets in linkTags0 and linkTags1
@ -1013,24 +1032,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
@Override @Override
public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) { public TagValency tagValency(final Tag tag, final Tag parentTag) {
boolean ignore = false; if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv;
/* First, inherit ignore property from eventual parent */
if(parentTag != null) {
ignore = parentTag.ignore;
}
/* Parent is not marked as ignored : let's check the current tag */ if (this.valencySwitchTagNames != null &&
if (!ignore &&
this.ignoreDivClassNames != null &&
tag != null && tag != null &&
(TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) { (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
final String classAttr = tag.opts.getProperty("class", EMPTY_STRING); final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr); final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
ignore = !Collections.disjoint(this.ignoreDivClassNames, classes); if (!Collections.disjoint(this.valencySwitchTagNames, classes)) return this.defaultValency.reverse();
} }
return ignore; return this.defaultValency;
} }
/** /**
@ -1604,13 +1616,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString()); if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset // scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset); final ScraperInputStream htmlFilter = new ScraperInputStream(
new ByteArrayInputStream(page),
StandardCharsets.UTF_8.name(),
new HashSet<String>(), TagValency.EVAL,
new VocabularyScraper(),
new DigestURL("http://localhost"),
false, maxLinks, timezoneOffset);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close(); htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString(); if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content // scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset); final ContentScraper scraper = new ContentScraper(
new DigestURL("http://localhost"),
maxLinks,
new HashSet<String>(),
TagValency.EVAL,
new VocabularyScraper(),
timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false); final Writer writer = new TransformerWriter(null, null, scraper, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close(); writer.close();

@ -80,7 +80,9 @@ public interface Scraper {
 * @return the valency (EVAL or IGNORE) to apply to the given tag
 * according to the scraper implementation rules
 */
public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag); public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
public TagValency defaultValency();
public void scrapeComment(final char[] comment); public void scrapeComment(final char[] comment);

@ -62,7 +62,8 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream( public ScraperInputStream(
final InputStream inStream, final InputStream inStream,
final String inputStreamCharset, final String inputStreamCharset,
final Set<String> ignore_class_name, final Set<String> valencySwitchTagNames,
final TagValency defaultValency,
final VocabularyScraper vocabularyScraper, final VocabularyScraper vocabularyScraper,
final DigestURL rooturl, final DigestURL rooturl,
final boolean passbyIfBinarySuspect, final boolean passbyIfBinarySuspect,
@ -73,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize); this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset); final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
scraper.registerHtmlFilterEventListener(this); scraper.registerHtmlFilterEventListener(this);
try { try {

@ -0,0 +1,30 @@
/**
* TagValency
* Copyright 2023 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
public enum TagValency {

    /** The tag (and its content) is excluded from indexing. */
    IGNORE,

    /** The tag is evaluated and indexed normally. */
    EVAL;

    /**
     * Flip the valency.
     *
     * @return {@code EVAL} when this is {@code IGNORE}, {@code IGNORE} when this is {@code EVAL}
     */
    public TagValency reverse() {
        if (this == IGNORE) {
            return EVAL;
        }
        return IGNORE;
    }
}

@ -283,7 +283,7 @@ public final class TransformerWriter extends Writer {
private char[] filterTagOpening(final String tagname, final char[] content) { private char[] filterTagOpening(final String tagname, final char[] content) {
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser()); ContentScraper.Tag tag = new ContentScraper.Tag(tagname, this.scraper.defaultValency(), charBuffer.propParser());
charBuffer.close(); charBuffer.close();
final ContentScraper.Tag parentTag; final ContentScraper.Tag parentTag;
@ -294,8 +294,8 @@ public final class TransformerWriter extends Writer {
} }
/* Check scraper ignoring rules */ /* Check scraper ignoring rules */
if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) { if (this.scraper != null) {
tag.setIgnore(true); tag.setValency(this.scraper.tagValency(tag, parentTag));
} }
/* Apply processing relevant for any kind of tag opening */ /* Apply processing relevant for any kind of tag opening */

@ -57,6 +57,7 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
@ -276,7 +277,16 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) { if (charset == null) {
ScraperInputStream htmlFilter = null; ScraperInputStream htmlFilter = null;
try { try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, false, maxLinks, timezoneOffset); htmlFilter = new ScraperInputStream(
sourceStream,
documentCharset,
ignore_class_name,
TagValency.EVAL,
vocabularyScraper,
location,
false,
maxLinks,
timezoneOffset);
sourceStream = htmlFilter; sourceStream = htmlFilter;
charset = htmlFilter.detectCharset(); charset = htmlFilter.detectCharset();
} catch (final IOException e1) { } catch (final IOException e1) {
@ -311,7 +321,14 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content // parsing the content
// for this static method no need to init local this.scraperObject here // for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset); final ContentScraper scraper = new ContentScraper(
location,
maxAnchors,
maxLinks,
ignore_class_name,
TagValency.EVAL,
vocabularyScraper,
timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available()))); final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
try { try {
final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte()); final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());

@ -66,6 +66,7 @@ import net.yacy.data.WorkTables;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -733,8 +734,13 @@ public class Crawler_p {
} else { } else {
/* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", ""); final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, final ContentScraper scraper = new ContentScraper(
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset()); new DigestURL(crawlingFile),
10000000,
new HashSet<String>(),
TagValency.EVAL,
new VocabularyScraper(),
profile.timezoneOffset());
final FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, final FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile); sb.crawler.putActive(handle, profile);
@ -874,7 +880,7 @@ public class Crawler_p {
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException { final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file; List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly // check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset); final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, false); final Writer writer = new TransformerWriter(null, null, scraper, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) { if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Let's report here detailed error to help user when he selected a wrong file */ /* Let's report here detailed error to help user when he selected a wrong file */

Loading…
Cancel
Save