From 1c0f50985c83ef2500399763a8d87a9ddd90da97 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Tue, 4 Apr 2023 12:41:12 +0200
Subject: [PATCH] fixed documentation and some details of handling of keywords

---
 defaults/solr.collection.schema                 |   2 +-
 defaults/solr/schema.xml                        |   2 +-
 locales/uk.lng                                  |   2 +-
 .../document/parser/html/ContentScraper.java    | 324 ++++++++++--------
 .../yacy/search/schema/CollectionSchema.java    |  56 +--
 5 files changed, 205 insertions(+), 181 deletions(-)

diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index 89b4dd45c..ea8117759 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -180,7 +180,7 @@ description_txt
 ## flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false, boolean
 #description_unique_b
 
-## content of keywords tag; words are separated by space
+## content of keywords tag; words are separated by comma, semicolon or space
 keywords
 
 ## character encoding, string
diff --git a/defaults/solr/schema.xml b/defaults/solr/schema.xml
index 288cb1f50..244928756 100644
--- a/defaults/solr/schema.xml
+++ b/defaults/solr/schema.xml
@@ -76,7 +76,7 @@
-
+
diff --git a/locales/uk.lng b/locales/uk.lng
index a9164787f..b7f558dc7 100644
--- a/locales/uk.lng
+++ b/locales/uk.lng
@@ -4062,7 +4062,7 @@ Active==Діє
 Attribute==Властивість
 Comment==Примітка
 Set==Виставити
-==
+==
 
 #-----------------------------
 # EOF
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 3b77d8509..b107167d0 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -80,10 +80,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     // statics: for initialization of the HTMLFilterAbstractScraper
     /** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */
-    private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
-
+    private static final Set<String> linkTags0 = new HashSet<>(12,0.99f);
+
     /** Set of tag names processed by pairs of start and end tag */
-    private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
+    private static final Set<String> linkTags1 = new HashSet<>(15,0.99f);
 
     private static final Pattern LB = Pattern.compile("\n");
@@ -147,19 +147,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         public Properties opts;
         public CharBuffer content;
         private TagValency tv;
-        public Tag(final String name, TagValency defaultValency) {
+        public Tag(final String name, final TagValency defaultValency) {
             this.name = name;
             this.tv = defaultValency;
             this.opts = new Properties();
             this.content = new CharBuffer(MAX_TAGSIZE);
         }
-        public Tag(final String name, TagValency defaultValency, final Properties opts) {
+        public Tag(final String name, final TagValency defaultValency, final Properties opts) {
             this.name = name;
             this.tv = defaultValency;
             this.opts = opts;
             this.content = new CharBuffer(MAX_TAGSIZE);
         }
-        public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) {
+        public Tag(final String name, final TagValency defaultValency, final Properties opts, final CharBuffer content) {
             this.name = name;
             this.tv = defaultValency;
             this.opts = opts;
@@ -173,9 +173,9 @@ public class ContentScraper extends AbstractScraper implements
Scraper { } @Override public String toString() { - return "<" + name + " " + opts + ">" + content + ""; + return "<" + this.name + " " + this.opts + ">" + this.content + ""; } - + /** @return true when this tag should be ignored from scraping */ public boolean isIgnore() { return this.tv == TagValency.IGNORE; @@ -201,18 +201,18 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final List anchors; private final SizeLimitedMap rss, css; private final SizeLimitedMap embeds; // urlhash/embed relation - private final List images; + private final List images; private final SizeLimitedSet script, frames, iframes; - + /** * URLs of linked data item types referenced from HTML content with standard * annotations such as RDFa, microdata, microformats or JSON-LD */ private final SizeLimitedSet linkedDataTypes; - + private final SizeLimitedMap metas; private final SizeLimitedMap hreflang, navigation; - private LinkedHashSet titles; + private final LinkedHashSet titles; private final List articles; private final List startDates, endDates; //private String headline; @@ -223,16 +223,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final EventListenerList htmlFilterEventListeners; private double lon, lat; private AnchorURL canonical, publisher; - + /** The maximum number of URLs to process and store in the anchors property. */ private final int maxAnchors; - + private final VocabularyScraper vocabularyScraper; - + /** Set of CSS class names whose matching div elements may switch from IGNORE to EVAL or vice versa */ private final Set valencySwitchTagNames; private final TagValency defaultValency; - + private final int timezoneOffset; private int breadcrumbs; @@ -249,13 +249,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { * evaluation scores: count appearance of specific attributes */ private final Evaluation evaluationScores; - + /** Set to true when a limit on content size scraped has been exceeded */ private boolean contentSizeLimitExceeded; - + /** Set to true when the maxAnchors limit has been exceeded */ private boolean maxAnchorsExceeded; - + /** * Create an ContentScraper instance * @param root the document root url @@ -274,7 +274,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final Set valencySwitchTagNames, final TagValency defaultValency, final VocabularyScraper vocabularyScraper, - int timezoneOffset) { + final int timezoneOffset) { // the root value here will not be used to load the resource. 
// it is only the reference for relative links super(linkTags0, linkTags1); @@ -285,31 +285,31 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.defaultValency = defaultValency; this.timezoneOffset = timezoneOffset; this.evaluationScores = new Evaluation(); - this.rss = new SizeLimitedMap(maxLinks); - this.css = new SizeLimitedMap(maxLinks); - this.anchors = new ArrayList(); - this.images = new ArrayList(); + this.rss = new SizeLimitedMap<>(maxLinks); + this.css = new SizeLimitedMap<>(maxLinks); + this.anchors = new ArrayList<>(); + this.images = new ArrayList<>(); this.icons = new HashMap<>(); - this.embeds = new SizeLimitedMap(maxLinks); - this.frames = new SizeLimitedSet(maxLinks); - this.iframes = new SizeLimitedSet(maxLinks); + this.embeds = new SizeLimitedMap<>(maxLinks); + this.frames = new SizeLimitedSet<>(maxLinks); + this.iframes = new SizeLimitedSet<>(maxLinks); this.linkedDataTypes = new SizeLimitedSet<>(maxLinks); - this.metas = new SizeLimitedMap(maxLinks); - this.hreflang = new SizeLimitedMap(maxLinks); - this.navigation = new SizeLimitedMap(maxLinks); - this.script = new SizeLimitedSet(maxLinks); - this.titles = new LinkedHashSet(); - this.articles = new ArrayList(); + this.metas = new SizeLimitedMap<>(maxLinks); + this.hreflang = new SizeLimitedMap<>(maxLinks); + this.navigation = new SizeLimitedMap<>(maxLinks); + this.script = new SizeLimitedSet<>(maxLinks); + this.titles = new LinkedHashSet<>(); + this.articles = new ArrayList<>(); this.startDates = new ArrayList<>(); this.endDates = new ArrayList<>(); this.headlines = (List[]) Array.newInstance(ArrayList.class, 6); - for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList(); - this.bold = new ClusteredScoreMap(false); - this.italic = new ClusteredScoreMap(false); - this.underline = new ClusteredScoreMap(false); - this.li = new ArrayList(); - this.dt = new ArrayList(); - this.dd = new ArrayList(); + for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<>(); + this.bold = new ClusteredScoreMap<>(false); + this.italic = new ClusteredScoreMap<>(false); + this.underline = new ClusteredScoreMap<>(false); + this.li = new ArrayList<>(); + this.dt = new ArrayList<>(); + this.dd = new ArrayList<>(); this.content = new CharBuffer(MAX_DOCSIZE, 1024); this.htmlFilterEventListeners = new EventListenerList(); this.lon = 0.0d; @@ -322,7 +322,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.maxAnchorsExceeded = false; this.maxAnchors = maxAnchors; } - + /** * Create an ContentScraper instance * @param root the document root url @@ -336,14 +336,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { final Set valencySwitchTagNames, final TagValency defaultValency, final VocabularyScraper vocabularyScraper, - int timezoneOffset) { + final int timezoneOffset) { this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset); } + @Override public TagValency defaultValency() { return this.defaultValency; } - + @Override public void finish() { this.content.trimToSize(); @@ -360,8 +361,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } int p, pl, q, s = 0; - char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); - + final char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); + // match evaluation pattern this.evaluationScores.match(Element.text, newtext); @@ -430,13 
+431,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // find absolute URLs inside text final Object[] listeners = this.htmlFilterEventListeners.getListenerList(); - List anchorListeners = new ArrayList<>(); + final List anchorListeners = new ArrayList<>(); for (int i = 0; i < listeners.length; i += 2) { if (listeners[i] == ContentScraperListener.class) { anchorListeners.add((ContentScraperListener)listeners[i+1]); } } - + if(!this.maxAnchorsExceeded) { int maxLinksToDetect = this.maxAnchors - this.anchors.size(); if(maxLinksToDetect < Integer.MAX_VALUE) { @@ -449,7 +450,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.anchors.remove(this.anchors.size() -1); } } - + // append string to content if (!b.isEmpty()) { this.content.append(b); @@ -458,12 +459,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://"); - + /** A regular expression pattern matching any whitespace character */ private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s"); - + /** - * Try to detect and parse absolute URLs in text (at most maxURLs) , then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null. + * Try to detect and parse absolute URLs in text (at most maxURLs) , then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null. * @param text the text to parse * @param urls a mutable collection of URLs to fill. * @param listeners a collection of listeners to trigger. @@ -480,17 +481,17 @@ public class ContentScraper extends AbstractScraper implements Scraper { AnchorURL url; final Matcher urlSchemeMatcher = protp.matcher(text); final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text); - + long detectedURLsCount = 0; while (offset < text.length() && detectedURLsCount < maxURLs) { if(!urlSchemeMatcher.find(offset)) { break; } schemePosition = urlSchemeMatcher.start(); - + hasWhiteSpace = whiteSpaceMatcher.find(urlSchemeMatcher.end()); urlString = text.substring(schemePosition, hasWhiteSpace ? whiteSpaceMatcher.start() : text.length()); - + if (urlString.endsWith(".")) { urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above } @@ -499,7 +500,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { urlString = removeUnpairedBrackets(urlString, '(', ')'); urlString = removeUnpairedBrackets(urlString, '{', '}'); urlString = removeUnpairedBrackets(urlString, '[', ']'); - + offset = schemePosition + urlString.length(); try { url = new AnchorURL(urlString); @@ -508,7 +509,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { urls.add(url); } if(listeners != null) { - for(ContentScraperListener listener : listeners) { + for(final ContentScraperListener listener : listeners) { listener.anchorAdded(url.toNormalform(false)); } } @@ -516,9 +517,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { } return detectedURLsCount; } - + /** - * Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null. + * Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null. * @param text the text to parse * @param urls a mutable collection of URLs to fill. 
* @param listeners a collection of listeners to trigger. @@ -530,7 +531,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { /** * Analyze bracket pairs found in the string and eventually * return a truncated version of that string when one or more pairs are incomplete - * + * * @param str * the string to analyze * @param openingMark @@ -566,7 +567,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { break; } } - + if (depth > 0) { /* One or more unpaired opening marks : truncate at the first opening level */ if(lastUnpairedOpeningIndex >= 0) { @@ -592,11 +593,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { return null; } } - + /** * Parse the eventual microdata itemtype attribute of a tag and extract its * valid URL tokens when the itemscope attribute is present. - * + * * @param tagAttributes parsed HTML tag attributes. * @return a set of URLs eventually empty when no itemtype attribute is present * or when its value is not valid @@ -629,14 +630,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { } return types; } - + private void checkOpts(final Tag tag) { // vocabulary classes final String classprop = tag.opts.getProperty("class", EMPTY_STRING); this.vocabularyScraper.check(this.root, classprop, tag.content); - + // itemprop microdata property (standard definition at https://www.w3.org/TR/microdata/#dfn-attr-itemprop) - String itemprop = tag.opts.getProperty("itemprop"); + final String itemprop = tag.opts.getProperty("itemprop"); if (itemprop != null) { String propval = tag.opts.getProperty("content"); // value for see https://html.spec.whatwg.org/multipage/microdata.html#values if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: while each prop is optional @@ -654,41 +655,41 @@ public class ContentScraper extends AbstractScraper implements Scraper { case "startDate": // try { // parse ISO 8601 date - Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); + final Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); this.startDates.add(startDate); - } catch (ParseException e) {} + } catch (final ParseException e) {} break; case "endDate": try { // parse ISO 8601 date - Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); + final Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); this.endDates.add(endDate); - } catch (ParseException e) {} + } catch (final ParseException e) {} break; } } } } - + /** * Parses sizes icon link attribute. (see * http://www.w3.org/TR/html5/links.html#attr-link-sizes) Eventual * duplicates are removed. - * + * * @param sizesAttr * sizes attribute string, may be null * @return a set of sizes eventually empty. 
*/ - public static Set parseSizes(String sizesAttr) { - Set sizes = new HashSet(); - Set tokens = parseSpaceSeparatedTokens(sizesAttr); - for (String token : tokens) { + public static Set parseSizes(final String sizesAttr) { + final Set sizes = new HashSet<>(); + final Set tokens = parseSpaceSeparatedTokens(sizesAttr); + for (final String token : tokens) { /* * "any" keyword may be present, but doesn't have to produce a * dimension result */ if (token != null) { - Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token); + final Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token); if (matcher.matches()) { /* With given pattern no NumberFormatException can occur */ sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2)))); @@ -702,30 +703,30 @@ public class ContentScraper extends AbstractScraper implements Scraper { * Parses a space separated tokens attribute value (see * http://www.w3.org/TR/html5/infrastructure.html#space-separated-tokens). * Eventual duplicates are removed. - * + * * @param attr * attribute string, may be null * @return a set of tokens eventually empty */ public static Set parseSpaceSeparatedTokens(final String attr) { - Set tokens = new HashSet<>(); + final Set tokens = new HashSet<>(); /* Check attr string is not empty to avoid adding a single empty string * in result */ if (attr != null && !attr.trim().isEmpty()) { - String[] items = attr.trim().split(CommonPattern.SPACES.pattern()); + final String[] items = attr.trim().split(CommonPattern.SPACES.pattern()); Collections.addAll(tokens, items); } return tokens; } - + /** * Retain only icon relations (standard and non standard) from tokens . * @param relTokens relationship tokens (parsed from a rel attribute) * @return a Set of icon relations, eventually empty */ - public Set retainIconRelations(Collection relTokens) { - HashSet iconRels = new HashSet<>(); - for(String token : relTokens) { + public Set retainIconRelations(final Collection relTokens) { + final HashSet iconRels = new HashSet<>(); + for(final String token : relTokens) { if(IconLinkRelations.isIconRel(token)) { iconRels.add(token.toLowerCase(Locale.ENGLISH)); } @@ -803,7 +804,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String href = tag.opts.getProperty("href", EMPTY_STRING); if (href.length() > 0) { tag.opts.put("name", areatitle); - AnchorURL url = absolutePath(href); + final AnchorURL url = absolutePath(href); if(url != null) { tag.opts.put("href", url.toNormalform(true)); url.setAll(tag.opts); @@ -816,21 +817,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (newLink != null) { tag.opts.put("href", newLink.toNormalform(true)); - String rel = tag.opts.getProperty("rel", EMPTY_STRING); + final String rel = tag.opts.getProperty("rel", EMPTY_STRING); /* Rel attribute is supposed to be a set of space-separated tokens */ - Set relTokens = parseSpaceSeparatedTokens(rel); + final Set relTokens = parseSpaceSeparatedTokens(rel); final String linktitle = tag.opts.getProperty("title", EMPTY_STRING); final String type = tag.opts.getProperty("type", EMPTY_STRING); final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING); - Set iconRels = retainIconRelations(relTokens); + final Set iconRels = retainIconRelations(relTokens); /* Distinguish icons from images. 
It will enable for example to later search only images and no icons */ if (!iconRels.isEmpty()) { - String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING); - Set sizes = parseSizes(sizesAttr); + final String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING); + final Set sizes = parseSizes(sizesAttr); IconEntry icon = this.icons.get(newLink); - /* There is already an icon with same URL for this document : + /* There is already an icon with same URL for this document : * they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */ if(icon != null) { icon.getRel().addAll(iconRels); @@ -880,7 +881,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if(tag.name.equalsIgnoreCase("param")) { final String name = tag.opts.getProperty("name", EMPTY_STRING); if (name.equalsIgnoreCase("movie")) { - AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING)); + final AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING)); if(url != null) { tag.opts.put("value", url.toNormalform(true)); url.setAll(tag.opts); @@ -918,7 +919,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { checkOpts(tag); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { - String href = tag.opts.getProperty("href", EMPTY_STRING); + final String href = tag.opts.getProperty("href", EMPTY_STRING); AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { if (followDenied()) { @@ -939,7 +940,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.divid, id); final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { - breadcrumbs++; + this.breadcrumbs++; } } else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); @@ -990,7 +991,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if (tag.name.equalsIgnoreCase("script")) { final String src = tag.opts.getProperty("src", EMPTY_STRING); if (src.length() > 0) { - AnchorURL absoluteSrc = absolutePath(src); + final AnchorURL absoluteSrc = absolutePath(src); if(absoluteSrc != null) { this.script.add(absoluteSrc); } @@ -1005,16 +1006,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { h = tag.opts.getProperty("datetime"); // TODO: checkOpts() also parses datetime property if in combination with schema.org itemprop=startDate/endDate if (h != null) { // datetime property is optional try { - Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime(); + final Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime(); this.startDates.add(startDate); - } catch (ParseException ex) { } + } catch (final ParseException ex) { } } } // fire event this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars()); } - + /** * Scraping operation applied to any kind of tag opening, being either singleton * or paired tag, not restricted to tags listed in @@ -1030,11 +1031,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts)); } } - + @Override public TagValency 
tagValency(final Tag tag, final Tag parentTag) { if (parentTag != null && parentTag.tv != this.defaultValency) return parentTag.tv; - + if (this.valencySwitchTagNames != null && tag != null && (TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) { @@ -1044,12 +1045,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } return this.defaultValency; } - + /** * Add an anchor to the anchors list, and trigger any eventual listener * @param anchor anchor to add. Must not be null. */ - protected void addAnchor(AnchorURL anchor) { + protected void addAnchor(final AnchorURL anchor) { if(this.anchors.size() >= this.maxAnchors) { this.maxAnchorsExceeded = true; } else { @@ -1067,7 +1068,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { public List getTitles() { // some documents have a title tag as meta tag - String s = this.metas.get("title"); + final String s = this.metas.get("title"); if (s != null && s.length() > 0) { this.titles.add(s); } @@ -1083,7 +1084,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // extract headline from file name - ArrayList t = new ArrayList(); + final ArrayList t = new ArrayList<>(); t.addAll(this.titles); return t; } @@ -1094,7 +1095,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String[] getBold() { - final List a = new ArrayList(); + final List a = new ArrayList<>(); final Iterator i = this.bold.keys(false); while (i.hasNext()) a.add(i.next()); return a.toArray(new String[a.size()]); @@ -1107,7 +1108,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String[] getItalic() { - final List a = new ArrayList(); + final List a = new ArrayList<>(); final Iterator i = this.italic.keys(false); while (i.hasNext()) a.add(i.next()); return a.toArray(new String[a.size()]); @@ -1120,7 +1121,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String[] getUnderline() { - final List a = new ArrayList(); + final List a = new ArrayList<>(); final Iterator i = this.underline.keys(false); while (i.hasNext()) a.add(i.next()); return a.toArray(new String[a.size()]); @@ -1143,18 +1144,18 @@ public class ContentScraper extends AbstractScraper implements Scraper { public String[] getDd() { return this.dd.toArray(new String[this.dd.size()]); } - + public List getStartDates() { return this.startDates; } - + public List getEndDates() { return this.endDates; } public DigestURL[] getFlash() { String ext; - ArrayList f = new ArrayList(); + final ArrayList f = new ArrayList<>(); for (final DigestURL url: this.anchors) { ext = MultiProtocolURL.getFileExtension(url.getFileName()); if (ext == null) continue; @@ -1176,7 +1177,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { public int breadcrumbCount() { return this.breadcrumbs; } - + public String getText() { try { return this.content.trim().toString(); @@ -1214,7 +1215,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // returns a url (String) / name (String) relation return this.iframes; } - + /** * @return URLs of linked data item types referenced from HTML content with standard * annotations such as RDFa, microdata, microformats or JSON-LD @@ -1234,11 +1235,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { public DigestURL getPublisherLink() { return this.publisher; } - + public Map getHreflang() { return this.hreflang; } - + public Map 
getNavigation() { return this.navigation; } @@ -1265,28 +1266,28 @@ public class ContentScraper extends AbstractScraper implements Scraper { public Map getIcons() { return this.icons; } - + /** * @return true when the limit on content size scraped has been exceeded */ public boolean isContentSizeLimitExceeded() { return this.contentSizeLimitExceeded; } - + /** * @param contentSizeLimitExceeded set to true when a limit on content size scraped has been exceeded */ public void setContentSizeLimitExceeded(final boolean contentSizeLimitExceeded) { this.contentSizeLimitExceeded = contentSizeLimitExceeded; } - + /** * @return true when the maxAnchors limit has been exceeded */ public boolean isMaxAnchorsExceeded() { return this.maxAnchorsExceeded; } - + /** * @return true when at least one limit on content size, anchors number or links number has been exceeded */ @@ -1296,7 +1297,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { || this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded() || this.frames.isLimitExceeded() || this.iframes.isLimitExceeded() || this.linkedDataTypes.isLimitExceeded(); } - + /* DC in html example: @@ -1312,7 +1313,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (s.indexOf("noindex",0) >= 0) return true; return false; } - + public boolean followDenied() { final String s = this.metas.get("robots"); if (s == null) return false; @@ -1323,7 +1324,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { public List getDescriptions() { String s = this.metas.get("description"); if (s == null) s = this.metas.get("dc.description"); - List descriptions = new ArrayList(); + final List descriptions = new ArrayList<>(); if (s == null) return descriptions; descriptions.add(s); return descriptions; @@ -1351,14 +1352,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final static Pattern commaSepPattern = Pattern.compile(" |,"); private final static Pattern semicSepPattern = Pattern.compile(" |;"); - + public Set getContentLanguages() { // i.e. 
// or String s = this.metas.get("content-language"); if (s == null) s = this.metas.get("dc.language"); if (s == null) return null; - final Set hs = new HashSet(); + final Set hs = new HashSet<>(); final String[] cl = commaSepPattern.split(s); int p; for (int i = 0; i < cl.length; i++) { @@ -1378,9 +1379,32 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (s.isEmpty()) { return new String[0]; } - if (s.contains(",")) return commaSepPattern.split(s); - if (s.contains(";")) return semicSepPattern.split(s); - return s.split("\\s"); + String[] k = null; + if (s.contains(",")) + k = commaSepPattern.split(s); + else if (s.contains(";")) + k = semicSepPattern.split(s); + else + k = s.split("\\s"); + + // trim the Strings + for (int i = 0; i < k.length; i++) + k[i] = k[i].trim(); + + // remove empty strings + int p = 0; + while (p < k.length) { + if (k[p].length() == 0) { + final String[] k1 = new String[k.length - 1]; + System.arraycopy(k, 0, k1, 0, p); + System.arraycopy(k, p + 1, k1, p, k1.length - p); + k = k1; + } else { + p++; + } + } + + return k; } public int getRefreshSeconds() { @@ -1406,34 +1430,34 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim(); return EMPTY_STRING; } - + public Date getDate() { String content; - + // content = this.metas.get("date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {} // content = this.metas.get("dc.date.modified"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} - + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {} + // content = this.metas.get("dc.date.created"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} - + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {} + // content = this.metas.get("dc.date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} - + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {} + // content = this.metas.get("dc:date"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} - + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {} + // content = this.metas.get("last-modified"); - if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} - + if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {} + return new Date(); } @@ -1482,7 +1506,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String[] getEvaluationModelScoreNames(final String modelName) { - final List a = new 
ArrayList(); + final List a = new ArrayList<>(); final ClusteredScoreMap scores = this.evaluationScores.getScores(modelName); if (scores != null) { final Iterator i = scores.keys(false); @@ -1537,7 +1561,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public void print() { - for (String t: this.titles) { + for (final String t: this.titles) { System.out.println("TITLE :" + t); } for (int i = 0; i < 4; i++) { @@ -1596,7 +1620,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } } - + /** * Fire addAnchor event to any listener implemening {@link ContentScraperListener} interface * @param url anchor url @@ -1617,11 +1641,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { // scrape document to look up charset final ScraperInputStream htmlFilter = new ScraperInputStream( - new ByteArrayInputStream(page), - StandardCharsets.UTF_8.name(), + new ByteArrayInputStream(page), + StandardCharsets.UTF_8.name(), new HashSet(), TagValency.EVAL, - new VocabularyScraper(), - new DigestURL("http://localhost"), + new VocabularyScraper(), + new DigestURL("http://localhost"), false, maxLinks, timezoneOffset); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); @@ -1629,11 +1653,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { // scrape content final ContentScraper scraper = new ContentScraper( - new DigestURL("http://localhost"), - maxLinks, - new HashSet(), - TagValency.EVAL, - new VocabularyScraper(), + new DigestURL("http://localhost"), + maxLinks, + new HashSet(), + TagValency.EVAL, + new VocabularyScraper(), timezoneOffset); final Writer writer = new TransformerWriter(null, null, scraper, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 0ac5c70e5..0fa509184 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -24,15 +24,15 @@ import java.util.Date; import java.util.List; import java.util.Locale; -import net.yacy.cora.federate.solr.SchemaDeclaration; -import net.yacy.cora.federate.solr.SolrType; - import org.apache.poi.ss.formula.atp.DateParser; import org.apache.poi.ss.formula.eval.EvaluationException; import org.apache.solr.common.SolrInputDocument; +import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.federate.solr.SolrType; + public enum CollectionSchema implements SchemaDeclaration { - + // mandatory id(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash **mandatory field**", true), sku(SolrType.string, true, true, false, true, true, "url of document", true), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. 
@@ -55,7 +55,7 @@ public enum CollectionSchema implements SchemaDeclaration { outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "external links, the url only without the protocol", true), // needed to enhance the crawler images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'", true), images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols", true), // for correct assembly of image url images_protocol_sxt + images_urlstub_sxt is needed - + // optional but recommended, part of index distribution fresh_date_dt(SolrType.date, true, true, false, false, false, "date until resource shall be considered as fresh"), referrer_id_s(SolrType.string, true, true, false, false, false, "id of the referrer to this document, discovered during crawling"),// byte[] referrerHash(); @@ -64,7 +64,7 @@ public enum CollectionSchema implements SchemaDeclaration { audiolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to audio resources"),// int laudio(); videolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to video resources"),// int lvideo(); applinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to application resources"),// int lapp(); - + // optional but recommended title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"), title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false"), @@ -97,7 +97,7 @@ public enum CollectionSchema implements SchemaDeclaration { description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"), description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"), description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false"), - keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"), + keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by comma, semicolon or space"), charset_s(SolrType.string, true, true, false, false, false, "character encoding"), wordcount_i(SolrType.num_integer, true, true, false, false, false, "number of words in visible area"), linkscount_i(SolrType.num_integer, true, true, false, false, false, "number of all outgoing links; including linksnofollowcount_i"), @@ -116,7 +116,7 @@ public enum CollectionSchema implements SchemaDeclaration { h4_txt(SolrType.text_general, true, true, true, false, true, "h4 header"), h5_txt(SolrType.text_general, true, true, true, false, true, "h5 header"), h6_txt(SolrType.text_general, true, true, true, false, true, "h6 header"), - + // unused, delete candidates @Deprecated md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5(); @@ -150,13 
+150,13 @@ public enum CollectionSchema implements SchemaDeclaration { metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of tag"), inboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "internal links, the visible anchor text"), outboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "external links, the visible anchor text"), - + icons_urlstub_sxt(SolrType.string, true, true, true, false, true, "all icon links without the protocol and '://'"), /** All icon links protocols : split from icons_urlstub to provide some compression, as http protocol is implied as default and not stored */ icons_protocol_sxt(SolrType.string, true, true, true, false, false, "all icon links protocols"), icons_rel_sxt(SolrType.string, true, true, true, false, false, "all icon links relationships space separated (e.g.. 'icon apple-touch-icon')"), icons_sizes_sxt(SolrType.string, true, true, true, false, false, "all icon sizes space separated (e.g. '16x16 32x32')"), - + images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"), images_alt_sxt(SolrType.string, true, true, true, false, true, "all image link alt tag"), // no need to index this; don't turn it into a txt field; use images_text_t instead images_height_val(SolrType.num_integer, true, true, true, false, false, "size of images:height"), @@ -192,7 +192,7 @@ public enum CollectionSchema implements SchemaDeclaration { navigation_url_sxt(SolrType.string, true, true, true, false, false, "page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html"), navigation_type_sxt(SolrType.string, true, true, true, false, false, "page navigation rel property value, can contain one of {top,up,next,prev,first,last}"), publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"), - + url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"), url_file_name_s(SolrType.string, true, true, false, false, true, "the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension"), url_file_name_tokens_t(SolrType.text_general, true, true, false, false, true, "tokens generated from url_file_name_s which can be used for better matching and result boosting"), @@ -228,15 +228,15 @@ public enum CollectionSchema implements SchemaDeclaration { opengraph_type_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"), opengraph_url_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"), opengraph_image_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"), - + // link structure for ranking cr_host_count_i(SolrType.num_integer, true, true, false, false, false, "the number of documents within a single host"), cr_host_chance_d(SolrType.num_double, true, true, false, false, false, "the chance to click on this page when randomly clicking on links within on one host"), cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. 
the maximum number is 10"), - + // custom rating; values to influence the ranking in combination with boost rules rating_i(SolrType.num_integer, true, true, false, false, false, "custom rating; to be set with external rating information"), - + // special values; can only be used if '_val' type is defined in schema file; this is not standard bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"), italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"), @@ -254,7 +254,7 @@ public enum CollectionSchema implements SchemaDeclaration { ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"), ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"), vocabularies_sxt(SolrType.string, true, true, true, false, false, "collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies"); - + public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0 public final static String VOCABULARY_PREFIX = "vocabulary_"; // collects all terms that appear for each vocabulary @@ -262,15 +262,15 @@ public enum CollectionSchema implements SchemaDeclaration { public final static String VOCABULARY_COUNT_SUFFIX = "_i"; // suffix for the term counter (>=1) that start with VOCABULARY_PREFIX - middle part is vocabulary name public final static String VOCABULARY_LOGCOUNT_SUFFIX = "_log_i"; // log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences public final static String VOCABULARY_LOGCOUNTS_SUFFIX = "_log_val"; // all integers from [0 to log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences - + private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private final SolrType type; private final boolean indexed, stored, searchable, multiValued, omitNorms, docValues; private String comment; - + /** When true, the field must be enabled for proper YaCy operation */ private boolean mandatory = false; - + private CollectionSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final boolean searchable, final String comment) { this(type, indexed, stored, multiValued, omitNorms, searchable, comment, false); } @@ -286,10 +286,10 @@ public enum CollectionSchema implements SchemaDeclaration { this.mandatory = mandatory; this.docValues = (type == SolrType.string || type == SolrType.date || type.name().startsWith("num_")); // verify our naming scheme - String name = this.name(); - int p = name.indexOf('_'); + final String name = this.name(); + final int p = name.indexOf('_'); if (p > 0) { - String ext = name.substring(p + 1); + final String ext = name.substring(p + 1); assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name; assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name; assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name; @@ -305,7 +305,7 @@ public enum CollectionSchema implements SchemaDeclaration { } assert type.appropriateName(this) : "bad configuration: " + this.name(); } - + /** * Returns the YaCy default or (if available) 
custom field name for Solr * @return SolrFieldname String @@ -320,7 +320,7 @@ public enum CollectionSchema implements SchemaDeclaration { * @param theValue = the field name */ @Override - public final void setSolrFieldName(String theValue) { + public final void setSolrFieldName(final String theValue) { // make sure no empty string is assigned if ( (theValue != null) && (!theValue.isEmpty()) ) { this.solrFieldName = theValue.toLowerCase(Locale.ROOT); @@ -358,7 +358,7 @@ public enum CollectionSchema implements SchemaDeclaration { public final boolean isSearchable() { return this.searchable; } - + @Override public boolean isDocValue() { return this.docValues; @@ -368,12 +368,12 @@ public enum CollectionSchema implements SchemaDeclaration { public final String getComment() { return this.comment; } - + @Override public final boolean isMandatory() { return this.mandatory; } - + @Override public final void add(final SolrInputDocument doc, final String value) { assert !this.isMultiValued(); @@ -444,11 +444,11 @@ public enum CollectionSchema implements SchemaDeclaration { } else if (this.type == SolrType.date) { assert (value.iterator().next() instanceof String) || (value.iterator().next() instanceof Date) : "type: " + value.iterator().next().getClass().getName(); if (value.iterator().next() instanceof String) { - Date[] da = new Date[value.size()]; + final Date[] da = new Date[value.size()]; for (int i = 0; i < value.size(); i++) { try { da[i] = DateParser.parseDate((String) value.get(i)).getTime(); - } catch (EvaluationException e) { + } catch (final EvaluationException e) { da[i] = null; } }
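
Note on the functional core of this patch: the reworked getKeywords() in ContentScraper.java now trims every token and removes empty tokens after splitting the keywords meta content on comma (or, failing that, semicolon, or whitespace). The following self-contained sketch mirrors that logic so the behavior can be exercised outside YaCy; the class name KeywordSplitDemo, the method parameter, and the sample input are illustrative assumptions, not part of the patch (the real method takes no parameter and reads the value from the document's meta map).

import java.util.Arrays;
import java.util.regex.Pattern;

/** Self-contained sketch of the patched keyword parsing; the class name is illustrative. */
public class KeywordSplitDemo {

    // same separator patterns as in ContentScraper: a space or the separator character
    private final static Pattern commaSepPattern = Pattern.compile(" |,");
    private final static Pattern semicSepPattern = Pattern.compile(" |;");

    public static String[] getKeywords(final String input) {
        if (input == null) return new String[0];
        final String s = input.trim();
        if (s.isEmpty()) return new String[0];

        // pick one separator class per value, as the patched method does
        String[] k;
        if (s.contains(",")) k = commaSepPattern.split(s);
        else if (s.contains(";")) k = semicSepPattern.split(s);
        else k = s.split("\\s");

        // trim the tokens (new in this patch)
        for (int i = 0; i < k.length; i++) k[i] = k[i].trim();

        // remove empty tokens, e.g. produced by ", " or doubled separators (new in this patch)
        int p = 0;
        while (p < k.length) {
            if (k[p].length() == 0) {
                final String[] k1 = new String[k.length - 1];
                System.arraycopy(k, 0, k1, 0, p);
                System.arraycopy(k, p + 1, k1, p, k1.length - p);
                k = k1;
            } else {
                p++;
            }
        }
        return k;
    }

    public static void main(final String[] args) {
        // before the patch, tokens like " yacy" (untrimmed) and "" could survive the split
        System.out.println(Arrays.toString(getKeywords("search, yacy,,p2p")));
        // -> [search, yacy, p2p]
    }
}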
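For comparison only — this is not what the patch implements — the same cleanup could be written as a single split on a combined character class. The behavior differs in one corner case: the patched code chooses one separator class per value, so for input "a;b, c" it keeps "a;b" as a single token, while the variant below splits it into "a" and "b".

import java.util.Arrays;

/** Hypothetical compact variant, not the patched implementation. */
public class KeywordSplitCompact {

    public static String[] getKeywords(final String s) {
        if (s == null) return new String[0];
        // one character class handles runs of comma, semicolon and whitespace at once
        return Arrays.stream(s.split("[,;\\s]+"))
                .filter(t -> !t.isEmpty()) // drops the leading "" when s starts with a separator
                .toArray(String[]::new);
    }

    public static void main(final String[] args) {
        System.out.println(Arrays.toString(getKeywords(" ,yacy; p2p  search,")));
        // -> [yacy, p2p, search]
    }
}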