introduction of tag-to-indexing relation TagValency

pull/554/head
Michael Peter Christen 2 years ago
parent 95e02e5291
commit 5acd98f4da

@@ -60,6 +60,7 @@ import net.yacy.data.BookmarksDB.Bookmark;
 import net.yacy.data.BookmarksDB.Tag;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.parser.html.TagValency;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.util.FileUtils;
@@ -135,7 +136,7 @@ public class BookmarkHelper {
         final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
         try {
             //load the links
-            final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), new VocabularyScraper(), 0);
+            final ContentScraper scraper = new ContentScraper(baseURL, 10000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), 0);
             //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
             final Writer writer = new TransformerWriter(null, null, scraper, false);
             FileUtils.copy(input,writer);
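Passing TagValency.EVAL at call sites like this one preserves the previous behavior: every tag is evaluated for indexing unless a valency switch (see ContentScraper.tagValency() below) flips it to IGNORE.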

@@ -146,22 +146,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         public String name;
         public Properties opts;
         public CharBuffer content;
-        /** Set to true when this tag should be ignored from scraping */
-        private boolean ignore = false;
-        public Tag(final String name) {
+        private TagValency tv;
+        public Tag(final String name, TagValency defaultValency) {
             this.name = name;
+            this.tv = defaultValency;
             this.opts = new Properties();
             this.content = new CharBuffer(MAX_TAGSIZE);
         }
-        public Tag(final String name, final Properties opts) {
+        public Tag(final String name, TagValency defaultValency, final Properties opts) {
             this.name = name;
+            this.tv = defaultValency;
             this.opts = opts;
             this.content = new CharBuffer(MAX_TAGSIZE);
         }
-        public Tag(final String name, final Properties opts, final CharBuffer content) {
+        public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
             this.name = name;
+            this.tv = defaultValency;
             this.opts = opts;
             this.content = content;
         }
@@ -178,14 +178,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         /** @return true when this tag should be ignored from scraping */
         public boolean isIgnore() {
-            return this.ignore;
+            return this.tv == TagValency.IGNORE;
         }
 
-        /**
-         * @param ignore true when this tag should be ignored from scraping
-         */
-        public void setIgnore(final boolean ignore) {
-            this.ignore = ignore;
+        public TagValency getValency() {
+            return this.tv;
         }
+        public void setValency(final TagValency tv) {
+            this.tv = tv;
+        }
     }
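Taken together, these Tag changes replace the one-way boolean ignore flag with a reversible state. A minimal usage sketch (assuming the classes from this commit on the classpath; the tag name is arbitrary):

    ContentScraper.Tag tag = new ContentScraper.Tag("div", TagValency.EVAL);
    assert !tag.isIgnore();                       // EVAL: content is scraped
    tag.setValency(TagValency.IGNORE);
    assert tag.isIgnore();                        // isIgnore() is now derived from the valency
    assert tag.getValency() == TagValency.IGNORE;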
@@ -230,8 +229,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final VocabularyScraper vocabularyScraper;
 
-    /** Set of CSS class names whose matching div elements content should be ignored */
-    private final Set<String> ignoreDivClassNames;
+    /** Set of CSS class names whose matching div elements may switch from IGNORE to EVAL or vice versa */
+    private final Set<String> valencySwitchTagNames;
+    private final TagValency defaultValency;
 
     private final int timezoneOffset;
     private int breadcrumbs;
@@ -261,19 +261,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     * @param root the document root url
     * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
     * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
-     * @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored
+     * @param valencySwitchTagNames an optional set of CSS class names whose matching div elements switch the default valency
+     * @param defaultValency the valency default; should be TagValency.EVAL by default
     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
     * @param timezoneOffset local time zone offset
     */
    @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(
+            final DigestURL root,
+            final int maxAnchors,
+            final int maxLinks,
+            final Set<String> valencySwitchTagNames,
+            final TagValency defaultValency,
+            final VocabularyScraper vocabularyScraper,
+            int timezoneOffset) {
        // the root value here will not be used to load the resource.
        // it is only the reference for relative links
        super(linkTags0, linkTags1);
        assert root != null;
        this.root = root;
        this.vocabularyScraper = vocabularyScraper;
-        this.ignoreDivClassNames = ignoreDivClassNames;
+        this.valencySwitchTagNames = valencySwitchTagNames;
+        this.defaultValency = defaultValency;
        this.timezoneOffset = timezoneOffset;
        this.evaluationScores = new Evaluation();
        this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@@ -321,8 +330,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
     * @param timezoneOffset local time zone offset
     */
-    public ContentScraper(final DigestURL root, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
-        this(root, Integer.MAX_VALUE, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+    public ContentScraper(
+            final DigestURL root,
+            final int maxLinks,
+            final Set<String> valencySwitchTagNames,
+            final TagValency defaultValency,
+            final VocabularyScraper vocabularyScraper,
+            int timezoneOffset) {
+        this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
     }
+
+    public TagValency defaultValency() {
+        return this.defaultValency;
+    }
 
     @Override
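The new defaultValency parameter is what makes valencySwitchTagNames bidirectional. A sketch of a hypothetical call site (the URL, the link limit and the CSS class name are placeholders, and DigestURL construction can throw MalformedURLException): with TagValency.IGNORE as the default, the scraper skips everything except div or nav elements whose class appears in the switch set, so the same set that used to be a blacklist acts as a whitelist.

    final Set<String> switchClasses = new HashSet<String>();
    switchClasses.add("article-body");            // hypothetical CSS class to index
    final ContentScraper scraper = new ContentScraper(
            new DigestURL("http://localhost"),    // reference for relative links only
            1000,                                 // maxLinks
            switchClasses,
            TagValency.IGNORE,                    // skip everything by default
            new VocabularyScraper(),
            0);                                   // timezoneOffset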
@@ -333,7 +352,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    @Override
    public void scrapeText(final char[] newtext0, final Tag insideTag) {
        if (insideTag != null) {
-            if(insideTag.ignore) {
+            if (insideTag.tv == TagValency.IGNORE) {
                return;
            }
            if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
@@ -720,7 +739,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     */
    @Override
    public void scrapeTag0(final Tag tag) {
-        if(tag.ignore) {
+        if (tag.tv == TagValency.IGNORE) {
            return;
        }
        checkOpts(tag);
@@ -893,7 +912,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     */
    @Override
    public void scrapeTag1(final Tag tag) {
-        if(tag.ignore) {
+        if (tag.tv == TagValency.IGNORE) {
            return;
        }
        checkOpts(tag);
@@ -1003,7 +1022,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     */
    @Override
    public void scrapeAnyTagOpening(final Tag tag) {
-        if (tag != null && !tag.ignore && tag.opts != null) {
+        if (tag != null && tag.tv == TagValency.EVAL && tag.opts != null) {
            /*
             * HTML microdata can be annotated on any kind of tag, so we don't restrict this
             * scraping to the limited sets in linkTags0 and linkTags1
@@ -1013,24 +1032,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
    @Override
-    public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
-        boolean ignore = false;
-        /* First, inherit ignore property from eventual parent */
-        if(parentTag != null) {
-            ignore = parentTag.ignore;
-        }
+    public TagValency tagValency(final Tag tag, final Tag parentTag) {
+        if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv;
 
        /* Parent is not marked as ignored : let's check the current tag */
-        if (!ignore &&
-            this.ignoreDivClassNames != null &&
+        if (this.valencySwitchTagNames != null &&
            tag != null &&
            (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
            final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
            final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
-            ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
+            if (!Collections.disjoint(this.valencySwitchTagNames, classes)) return this.defaultValency.reverse();
        }
-        return ignore;
+        return this.defaultValency;
    }
 
    /**
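The resolution rule above is compact but subtle: a tag inherits its parent's valency only when the parent deviates from the default, so an ignored (or whitelisted) subtree propagates to all descendants, while every other tag falls back to the default or to a class-triggered flip. A standalone model of the rule, with illustrative names that are not part of the commit:

    static TagValency resolve(
            final TagValency parentValency,       // null when the tag has no parent
            final TagValency defaultValency,
            final boolean isDivOrNav,
            final Set<String> tagClasses,
            final Set<String> switchClasses) {
        // a parent that deviates from the default wins: subtree propagation
        if (parentValency != null && parentValency != defaultValency) return parentValency;
        // a div/nav whose classes intersect the switch set reverses the default
        if (isDivOrNav && switchClasses != null && !Collections.disjoint(switchClasses, tagClasses)) {
            return defaultValency.reverse();
        }
        return defaultValency;
    }

For example, under an EVAL default a div with class "sidebar" resolves to IGNORE when "sidebar" is in the switch set, and every descendant of that div then inherits IGNORE through the parent check.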
@@ -1604,13 +1616,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        if (page == null) throw new IOException("no content in file " + file.toString());
 
        // scrape document to look up charset
-        final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset);
+        final ScraperInputStream htmlFilter = new ScraperInputStream(
+                new ByteArrayInputStream(page),
+                StandardCharsets.UTF_8.name(),
+                new HashSet<String>(), TagValency.EVAL,
+                new VocabularyScraper(),
+                new DigestURL("http://localhost"),
+                false, maxLinks, timezoneOffset);
        String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
        htmlFilter.close();
        if (charset == null) charset = Charset.defaultCharset().toString();
 
        // scrape content
-        final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(
+                new DigestURL("http://localhost"),
+                maxLinks,
+                new HashSet<String>(),
+                TagValency.EVAL,
+                new VocabularyScraper(),
+                timezoneOffset);
        final Writer writer = new TransformerWriter(null, null, scraper, false);
        FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
        writer.close();

@@ -80,7 +80,9 @@ public interface Scraper {
     * @return the valency of the tag according to the scraper
     * implementation rules
     */
-    public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
+    public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
+
+    public TagValency defaultValency();
 
    public void scrapeComment(final char[] comment);
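Any Scraper implementation outside this diff now has to provide both methods. A minimal sketch of a hypothetical implementer that never ignores anything (all other Scraper methods omitted):

    @Override
    public TagValency tagValency(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag) {
        return TagValency.EVAL; // evaluate every tag unconditionally
    }

    @Override
    public TagValency defaultValency() {
        return TagValency.EVAL;
    }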

@@ -62,7 +62,8 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
    public ScraperInputStream(
            final InputStream inStream,
            final String inputStreamCharset,
-            final Set<String> ignore_class_name,
+            final Set<String> valencySwitchTagNames,
+            final TagValency defaultValency,
            final VocabularyScraper vocabularyScraper,
            final DigestURL rooturl,
            final boolean passbyIfBinarySuspect,
@@ -73,7 +74,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
        this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
        this.bufferedIn.mark((int) preBufferSize);
 
-        final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
        scraper.registerHtmlFilterEventListener(this);
 
        try {

@@ -0,0 +1,30 @@
+/**
+ *  TagValency
+ *  Copyright 2023 by Michael Peter Christen, @0rb1t3r
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.document.parser.html;
+
+public enum TagValency {
+    IGNORE, // do not index that tag
+    EVAL;   // do index that tag
+
+    public TagValency reverse() {
+        return this == IGNORE ? EVAL : IGNORE;
+    }
+}
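Since the enum has exactly two values, reverse() is an involution; this is what lets a single switch set act as a blacklist under an EVAL default and as a whitelist under an IGNORE default. A quick sanity sketch:

    assert TagValency.EVAL.reverse() == TagValency.IGNORE;
    assert TagValency.IGNORE.reverse() == TagValency.EVAL;
    assert TagValency.EVAL.reverse().reverse() == TagValency.EVAL; // a double flip restores the default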

@@ -283,7 +283,7 @@ public final class TransformerWriter extends Writer {
    private char[] filterTagOpening(final String tagname, final char[] content) {
 
        final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
-        ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
+        ContentScraper.Tag tag = new ContentScraper.Tag(tagname, this.scraper.defaultValency(), charBuffer.propParser());
        charBuffer.close();
 
        final ContentScraper.Tag parentTag;
@@ -294,8 +294,8 @@ public final class TransformerWriter extends Writer {
        }
 
        /* Check scraper ignoring rules */
-        if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) {
-            tag.setIgnore(true);
+        if (this.scraper != null) {
+            tag.setValency(this.scraper.tagValency(tag, parentTag));
        }
 
        /* Apply processing relevant for any kind of tag opening */
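Note the two-step assignment: every tag is first created with the scraper's defaultValency(), and tagValency(tag, parentTag) then overrides it per tag, so a tag always carries a definite valency instead of an optional ignore flag.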

@@ -57,6 +57,7 @@ import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.html.ScraperInputStream;
+import net.yacy.document.parser.html.TagValency;
 import net.yacy.document.parser.html.TransformerWriter;
@@ -276,7 +277,16 @@ public class htmlParser extends AbstractParser implements Parser {
        if (charset == null) {
            ScraperInputStream htmlFilter = null;
            try {
-                htmlFilter = new ScraperInputStream(sourceStream, documentCharset, ignore_class_name, vocabularyScraper, location, false, maxLinks, timezoneOffset);
+                htmlFilter = new ScraperInputStream(
+                        sourceStream,
+                        documentCharset,
+                        ignore_class_name,
+                        TagValency.EVAL,
+                        vocabularyScraper,
+                        location,
+                        false,
+                        maxLinks,
+                        timezoneOffset);
                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
            } catch (final IOException e1) {
@@ -311,7 +321,14 @@ public class htmlParser extends AbstractParser implements Parser {
 
        // parsing the content
        // for this static method no need to init local this.scraperObject here
-        final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, ignore_class_name, vocabularyScraper, timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(
+                location,
+                maxAnchors,
+                maxLinks,
+                ignore_class_name,
+                TagValency.EVAL,
+                vocabularyScraper,
+                timezoneOffset);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
        try {
            final long maxChars = (long)(maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
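Both htmlParser call sites fix the default at TagValency.EVAL, so an ignore_class_name set keeps its old blacklist meaning here: matching div and nav elements flip to IGNORE. Whitelist-style scraping only becomes available to callers that construct a ContentScraper or ScraperInputStream with a TagValency.IGNORE default themselves.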

@@ -66,6 +66,7 @@ import net.yacy.data.WorkTables;
 import net.yacy.document.Document;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.parser.html.TagValency;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
@@ -733,8 +734,13 @@ public class Crawler_p {
            } else {
                /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
                final String crawlingFileContent = post.get("crawlingFile$file", "");
-                final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
-                        new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
+                final ContentScraper scraper = new ContentScraper(
+                        new DigestURL(crawlingFile),
+                        10000000,
+                        new HashSet<String>(),
+                        TagValency.EVAL,
+                        new VocabularyScraper(),
+                        profile.timezoneOffset());
                final FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
                        sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
                sb.crawler.putActive(handle, profile);
@@ -874,7 +880,7 @@ public class Crawler_p {
            final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
        List<AnchorURL> hyperlinks_from_file;
        // check if the crawl filter works correctly
-        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
+        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), timezoneOffset);
        final Writer writer = new TransformerWriter(null, null, scraper, false);
        if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
            /* Let's report here detailed error to help user when he selected a wrong file */
