diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index dcf7d5434..78a3087f0 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -73,7 +73,12 @@ text_t wordcount_i ## internal links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen -attr_inboundlinks +attr_inboundlinks_tag +attr_inboundlinks_protocol +attr_inboundlinks_urlstub +attr_inboundlinks_name +attr_inboundlinks_rel +attr_inboundlinks_text ## total number of inbound links, int inboundlinkscount_i @@ -82,7 +87,12 @@ inboundlinkscount_i inboundlinksnoindexcount_i ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen -attr_outboundlinks +attr_outboundlinks_tag +attr_outboundlinks_protocol +attr_outboundlinks_urlstub +attr_outboundlinks_name +attr_outboundlinks_rel +attr_outboundlinks_text ## total number of external links, int outboundlinkscount_i diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index de76cffbc..0f4a511f7 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -115,13 +115,22 @@ ::
Link List + + + + + + + + #{links}# - + - + + #{/links}#
nrtypenamelinktextrel
#[nr]# #[type]##[text]##[name]# #[link]##[attr]##[text]##[rel]#
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 2113854a7..1d378b7c2 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -36,6 +36,7 @@ import java.util.Collection; import java.util.Enumeration; import java.util.Iterator; import java.util.Map; +import java.util.Properties; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -309,8 +310,8 @@ public class ViewFile { prop.put("viewMode", VIEW_MODE_AS_LINKLIST); boolean dark = true; int i = 0; - i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0)); - i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); + i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0), document.getAnchors()); + i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors()); dark = (i % 2 == 0); final Map ts = document.getImages(); @@ -324,15 +325,17 @@ public class ViewFile { prop.put("viewMode_links_" + i + "_text", (entry.alt().isEmpty()) ? " " : markup(wordArray, entry.alt())); prop.put("viewMode_links_" + i + "_url", entry.url().toNormalform(false, true)); prop.put("viewMode_links_" + i + "_link", markup(wordArray, entry.url().toNormalform(false, true))); - if (entry.width() > 0 && entry.height() > 0) - prop.put("viewMode_links_" + i + "_attr", entry.width() + "x" + entry.height() + " Pixel"); - else - prop.put("viewMode_links_" + i + "_attr", "unknown"); + if (entry.width() > 0 && entry.height() > 0) { + prop.put("viewMode_links_" + i + "_rel", entry.width() + "x" + entry.height() + " Pixel"); + } else { + prop.put("viewMode_links_" + i + "_rel", ""); + } + prop.put("viewMode_links_" + i + "_name", ""); dark = !dark; i++; } - i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0)); - i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0)); + i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0), document.getAnchors()); + i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0), document.getAnchors()); prop.put("viewMode_links", i); } @@ -382,16 +385,29 @@ public class ViewFile { return message; } - private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map media, final String name, boolean dark) { + private static int putMediaInfo( + final serverObjects prop, + final String[] wordArray, + int c, + final Map media, + final String type, + boolean dark, + final Map alllinks) { int i = 0; for (final Map.Entry entry : media.entrySet()) { + final Properties p = alllinks.get(entry.getKey()); + final String name = p.getProperty("name", ""); // the name attribute + final String rel = p.getProperty("rel", ""); // the rel-attribute + final String text = p.getProperty("text", ""); // the text between the
tag + prop.put("viewMode_links_" + c + "_nr", c); prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0)); - prop.putHTML("viewMode_links_" + c + "_type", name); - prop.put("viewMode_links_" + c + "_text", ((entry.getValue().isEmpty()) ? " " : markup(wordArray, entry.getValue()) )); + prop.putHTML("viewMode_links_" + c + "_type", type); + prop.put("viewMode_links_" + c + "_text", text + "/" + ((entry.getValue().isEmpty()) ? " " : markup(wordArray, entry.getValue()) )); prop.put("viewMode_links_" + c + "_link", markup(wordArray, entry.getKey().toNormalform(true, false))); prop.put("viewMode_links_" + c + "_url", entry.getKey().toNormalform(true, false)); - prop.put("viewMode_links_" + c + "_attr", " "); + prop.put("viewMode_links_" + c + "_rel", rel); + prop.put("viewMode_links_" + c + "_name", name); dark = !dark; c++; i++; diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 868d0a9d1..063780ae7 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -129,36 +129,76 @@ public class SolrScheme extends ConfigurationSet { if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount()); if (isEmpty() || contains("attr_inboundlinks")) { - final String[] inboundlinks = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()]; + final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()]; for (final MultiProtocolURI url: yacydoc.inboundLinks()) { final Properties p = alllinks.get(url); - final String name = p.getProperty("name", ""); - final String rel = p.getProperty("rel", ""); - inboundlinks[c++] = + final String name = p.getProperty("name", ""); // the name attribute + final String rel = p.getProperty("rel", ""); // the rel-attribute + final String text = p.getProperty("text", ""); // the text between the tag + final String urls = url.toNormalform(false, false); + final int pr = urls.indexOf("://"); + inboundlinksURLProtocol[c] = urls.substring(0, pr); + inboundlinksURLStub[c] = urls.substring(pr + 3); + inboundlinksName[c] = name.length() > 0 ? name : ""; + inboundlinksRel[c] = rel.length() > 0 ? rel : ""; + inboundlinksText[c] = text.length() > 0 ? rel : ""; + inboundlinksTag[c] = " 0 ? " rel=\"" + rel + "\"" : "") + ">" + ((name.length() > 0) ? name : "") + ""; + c++; } - addSolr(solrdoc, "attr_inboundlinks", inboundlinks); + addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag); + addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol); + addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub); + addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName); + addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel); + addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText); } + c = 0; if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount()); if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount()); if (isEmpty() || contains("attr_outboundlinks")) { - final String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()]; + final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()]; for (final MultiProtocolURI url: yacydoc.outboundLinks()) { final Properties p = alllinks.get(url); - final String name = p.getProperty("name", ""); - final String rel = p.getProperty("rel", ""); - outboundlinks[c++] = + final String name = p.getProperty("name", ""); // the name attribute + final String rel = p.getProperty("rel", ""); // the rel-attribute + final String text = p.getProperty("text", ""); // the text between the tag + final String urls = url.toNormalform(false, false); + final int pr = urls.indexOf("://"); + outboundlinksURLProtocol[c] = urls.substring(0, pr); + outboundlinksURLStub[c] = urls.substring(pr + 3); + outboundlinksName[c] = name.length() > 0 ? name : ""; + outboundlinksRel[c] = rel.length() > 0 ? rel : ""; + outboundlinksText[c] = text.length() > 0 ? rel : ""; + outboundlinksTag[c] = " 0 ? " rel=\"" + rel + "\"" : "") + ">" + ((name.length() > 0) ? name : "") + ""; + c++; } - addSolr(solrdoc, "attr_outboundlinks", outboundlinks); + addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag); + addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol); + addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub); + addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName); + addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel); + addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText); } + // charset addSolr(solrdoc, "charset_s", yacydoc.getCharset()); diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index c429d4b66..e75ee0fd5 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -242,8 +242,9 @@ public class SolrSingleConnector { } protected void addSolr(final Collection docs) throws IOException, SolrException { + try { - this.server.add(docs); + if (docs.size() != 0) this.server.add(docs); this.server.commit(); /* To immediately commit after adding documents, you could use: UpdateRequest req = new UpdateRequest(); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index d013b25f6..53755465b 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -37,6 +37,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.regex.Matcher; @@ -172,6 +173,36 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.canonical = null; } + private void mergeAnchors(final MultiProtocolURI url, final Properties p) { + final Properties p0 = this.anchors.get(url); + if (p0 == null) { + this.anchors.put(url, p); + return; + } + // merge properties + for (final Entry entry: p.entrySet()) { + if (entry.getValue() != null && entry.getValue().toString().length() > 0) p0.put(entry.getKey(), entry.getValue()); + } + this.anchors.put(url, p0); + } + + /* + private void mergeAnchors(final MultiProtocolURI url, final String key, final String value) { + if (value == null) return; + if (value.length() == 0) return; + Properties p0 = this.anchors.get(url); + if (p0 == null) { + p0 = new Properties(); + p0.put(key, value); + this.anchors.put(url, p0); + return; + } + // merge properties + p0.put(key, value); + this.anchors.put(url, p0); + } + */ + public void scrapeText(final char[] newtext, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); int p, pl, q, s = 0; @@ -258,7 +289,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { s = p + 6; try { url = new MultiProtocolURI(u); - this.anchors.put(url, new Properties()); + mergeAnchors(url, new Properties()); continue; } catch (final MalformedURLException e) {} } @@ -306,7 +337,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } catch (final MalformedURLException e) {} } else if (tagname.equalsIgnoreCase("frame")) { final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", "")); - this.anchors.put(src, tagopts /* with property "name" */); + mergeAnchors(src, tagopts /* with property "name" */); this.frames.add(src); this.evaluationScores.match(Element.framepath, src.toNormalform(true, false)); } else if (tagname.equalsIgnoreCase("body")) { @@ -333,8 +364,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String areatitle = cleanLine(tagopts.getProperty("title","")); //String alt = tagopts.getProperty("alt",""); final String href = tagopts.getProperty("href", ""); - final Properties p = new Properties(); p.put("name", areatitle); - if (href.length() > 0) this.anchors.put(absolutePath(href), p); + tagopts.put("nme", areatitle); + if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts); } else if (tagname.equalsIgnoreCase("link")) { final String href = tagopts.getProperty("href", ""); final MultiProtocolURI newLink = absolutePath(href); @@ -349,8 +380,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.images.put(ie.url(), ie); this.favicon = newLink; } else if (rel.equalsIgnoreCase("canonical")) { - final Properties p = new Properties(); p.put("name", this.title); - this.anchors.put(newLink, p); + tagopts.put("name", this.title); + mergeAnchors(newLink, tagopts); this.canonical = newLink; } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) { this.rss.put(newLink, linktitle); @@ -358,16 +389,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.css.put(newLink, rel); this.evaluationScores.match(Element.csspath, href); } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) { - final Properties p = new Properties(); p.put("name", linktitle); - this.anchors.put(newLink, p); + tagopts.put("name", linktitle); + mergeAnchors(newLink, tagopts); } } } else if(tagname.equalsIgnoreCase("embed")) { - this.anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */); + mergeAnchors(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */); } else if(tagname.equalsIgnoreCase("param")) { final String name = tagopts.getProperty("name", ""); if (name.equalsIgnoreCase("movie")) { - this.anchors.put(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */); + mergeAnchors(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */); } } @@ -389,8 +420,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1); addImage(this.images, ie); } else { - tagopts.put("name", recursiveParse(text)); - this.anchors.put(url, tagopts); + tagopts.put("text", recursiveParse(text)); + mergeAnchors(url, tagopts); } } this.evaluationScores.match(Element.apath, href); @@ -430,7 +461,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (h.length() > 0) this.li.add(h); } else if (tagname.equalsIgnoreCase("iframe")) { final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", "")); - this.anchors.put(src, tagopts /* with property "name" */); + mergeAnchors(src, tagopts /* with property "name" */); this.iframes.add(src); this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false)); } else if (tagname.equalsIgnoreCase("script")) { @@ -466,7 +497,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { Log.logException(e); return cleanLine(super.stripAll(inlineHtml)); } - this.anchors.putAll(scraper.getAnchors()); + for (final Map.Entry entry: scraper.getAnchors().entrySet()) { + mergeAnchors(entry.getKey(), entry.getValue()); + } this.images.putAll(scraper.images); return cleanLine(super.stripAll(scraper.content.getChars())); @@ -559,7 +592,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public boolean containsFlash() { - this.anchors = new HashMap(); String ext; for (final MultiProtocolURI url: this.anchors.keySet()) { ext = url.getFileExtension(); diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 77d50833f..11c2cb606 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -199,15 +199,51 @@ public final class TransformerWriter extends Writer { return result; } + private static final char[] mergedScrape(final char[] a, final char[] b) { + if (a == null) return b; + if (b == null) return a; + final StringBuilder sb = new StringBuilder(a.length + b.length + 1); + sb.append(a).append(' ').append(b); + return sb.toString().toCharArray(); + } + + /** + * every tag that appears is handed to the filterTag method. The method then returns text from the tag + * but also operates on the tag content to scrape information from it. In case that a tag is unclosed if + * another tag appears, both, the unclosed and the new one are merged into one new char[] + * @param tag + * @param opening + * @param content + * @param quotechar + * @return + */ private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) { -// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug + //System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug + char[] unclosed = null; + + if (this.filterTag != null && opening) { + // there is a missing close tag for the currently parsed tag filterTag + // close that tag here and go on with new tag + if (this.scraper != null) { + this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); + } + if (this.transformer != null) { + unclosed = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); + } else { + unclosed = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); + } + this.filterTag = null; + this.filterOpts = null; + this.filterCont = null; + } + if (this.filterTag == null) { // we are not collection tag text if (tag == null) { // and this is not a tag opener/closer if (this.scraper != null) this.scraper.scrapeText(content, null); - if (this.transformer != null) return this.transformer.transformText(content); - return content; + if (this.transformer != null) return mergedScrape(unclosed, this.transformer.transformText(content)); + return mergedScrape(unclosed, content); } // we have a new tag @@ -227,7 +263,7 @@ public final class TransformerWriter extends Writer { // this single tag is collected at once here final CharBuffer scb = new CharBuffer(content); try { - return this.transformer.transformTag0(tag, scb.propParser(), quotechar); + return mergedScrape(unclosed, this.transformer.transformTag0(tag, scb.propParser(), quotechar)); } finally { try { scb.close(); @@ -247,15 +283,15 @@ public final class TransformerWriter extends Writer { Log.logException(e); } if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset(); - return new char[0]; + return mergedScrape(unclosed, new char[0]); } else { // we ignore that thing and return it again - return genTag0raw(tag, true, content); + return mergedScrape(unclosed, genTag0raw(tag, true, content)); } } // we ignore that thing and return it again - return genTag0raw(tag, false, content); + return mergedScrape(unclosed, genTag0raw(tag, false, content)); } @@ -270,14 +306,14 @@ public final class TransformerWriter extends Writer { this.filterCont.append(content); } } catch (final OutOfMemoryError e) {} - return new char[0]; + return mergedScrape(unclosed, new char[0]); } // it's a tag! which one? - if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) { + if (opening || !(tag.equalsIgnoreCase(this.filterTag))) { // this tag is not our concern. just add it this.filterCont.append(genTag0raw(tag, opening, content)); - return new char[0]; + return mergedScrape(unclosed, new char[0]); } // it's our closing tag! return complete result. @@ -291,7 +327,7 @@ public final class TransformerWriter extends Writer { this.filterTag = null; this.filterOpts = null; this.filterCont = null; - return ret; + return mergedScrape(unclosed, ret); } private char[] filterFinalize(final char quotechar) { @@ -301,7 +337,9 @@ public final class TransformerWriter extends Writer { // it's our closing tag! return complete result. char[] ret; - if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); + if (this.scraper != null) { + this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); + } if (this.transformer != null) { ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); } else {