- bugfixes in html parser

- new fields in solr
- extended file viewer to debug parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7897 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 2c595a6a47
commit 5dd2efc9a2

@ -73,7 +73,12 @@ text_t
wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks
attr_inboundlinks_tag
attr_inboundlinks_protocol
attr_inboundlinks_urlstub
attr_inboundlinks_name
attr_inboundlinks_rel
attr_inboundlinks_text
## total number of inbound links, int
inboundlinkscount_i
@ -82,7 +87,12 @@ inboundlinkscount_i
inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks
attr_outboundlinks_tag
attr_outboundlinks_protocol
attr_outboundlinks_urlstub
attr_outboundlinks_name
attr_outboundlinks_rel
attr_outboundlinks_text
## total number of external links, int
outboundlinkscount_i

@ -115,13 +115,22 @@
:: <!-- 6 -->
<fieldset><legend>Link List</legend>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="head">
<td>nr</td>
<td class="tt">type</td>
<td class="tt">name</td>
<td class="tt">link</td>
<td class="tt">text</td>
<td class="tt">rel</td>
</tr>
#{links}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td>#[nr]#</td>
<td class="tt"><tt>#[type]#</tt></td>
<td class="tt"><tt>#[text]#</tt></td>
<td class="tt"><tt>#[name]#</tt></td>
<td class="tt"><tt><a href="#[url]#">#[link]#</a></tt></td>
<td class="tt"><tt>#[attr]#</tt></td>
<td class="tt"><tt>#[text]#</tt></td>
<td class="tt"><tt>#[rel]#</tt></td>
</tr>#{/links}#
</table>
</fieldset>

@ -36,6 +36,7 @@ import java.util.Collection;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@ -309,8 +310,8 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
boolean dark = true;
int i = 0;
i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0));
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0), document.getAnchors());
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors());
dark = (i % 2 == 0);
final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
@ -324,15 +325,17 @@ public class ViewFile {
prop.put("viewMode_links_" + i + "_text", (entry.alt().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.alt()));
prop.put("viewMode_links_" + i + "_url", entry.url().toNormalform(false, true));
prop.put("viewMode_links_" + i + "_link", markup(wordArray, entry.url().toNormalform(false, true)));
if (entry.width() > 0 && entry.height() > 0)
prop.put("viewMode_links_" + i + "_attr", entry.width() + "x" + entry.height() + " Pixel");
else
prop.put("viewMode_links_" + i + "_attr", "unknown");
if (entry.width() > 0 && entry.height() > 0) {
prop.put("viewMode_links_" + i + "_rel", entry.width() + "x" + entry.height() + " Pixel");
} else {
prop.put("viewMode_links_" + i + "_rel", "");
}
prop.put("viewMode_links_" + i + "_name", "");
dark = !dark;
i++;
}
i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0));
i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0));
i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0), document.getAnchors());
i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0), document.getAnchors());
prop.put("viewMode_links", i);
}
@ -382,16 +385,29 @@ public class ViewFile {
return message;
}
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) {
private static int putMediaInfo(
final serverObjects prop,
final String[] wordArray,
int c,
final Map<MultiProtocolURI, String> media,
final String type,
boolean dark,
final Map<MultiProtocolURI, Properties> alllinks) {
int i = 0;
for (final Map.Entry<MultiProtocolURI, String> entry : media.entrySet()) {
final Properties p = alllinks.get(entry.getKey());
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
prop.put("viewMode_links_" + c + "_nr", c);
prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));
prop.putHTML("viewMode_links_" + c + "_type", name);
prop.put("viewMode_links_" + c + "_text", ((entry.getValue().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.getValue()) ));
prop.putHTML("viewMode_links_" + c + "_type", type);
prop.put("viewMode_links_" + c + "_text", text + "/" + ((entry.getValue().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.getValue()) ));
prop.put("viewMode_links_" + c + "_link", markup(wordArray, entry.getKey().toNormalform(true, false)));
prop.put("viewMode_links_" + c + "_url", entry.getKey().toNormalform(true, false));
prop.put("viewMode_links_" + c + "_attr", "&nbsp;");
prop.put("viewMode_links_" + c + "_rel", rel);
prop.put("viewMode_links_" + c + "_name", name);
dark = !dark;
c++;
i++;

@ -129,36 +129,76 @@ public class SolrScheme extends ConfigurationSet {
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains("attr_inboundlinks")) {
final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", "");
final String rel = p.getProperty("rel", "");
inboundlinks[c++] =
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://");
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
// store the anchor text (the text between <a> and </a>), not the rel attribute
inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
c++;
}
addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
}
c = 0;
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
if (isEmpty() || contains("attr_outboundlinks")) {
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", "");
final String rel = p.getProperty("rel", "");
outboundlinks[c++] =
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://");
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
// store the anchor text (the text between <a> and </a>), not the rel attribute
outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
c++;
}
addSolr(solrdoc, "attr_outboundlinks", outboundlinks);
addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
}
// charset
addSolr(solrdoc, "charset_s", yacydoc.getCharset());

@ -242,8 +242,9 @@ public class SolrSingleConnector {
}
protected void addSolr(final Collection<SolrInputDocument> docs) throws IOException, SolrException {
try {
this.server.add(docs);
if (docs.size() != 0) this.server.add(docs);
this.server.commit();
/* To immediately commit after adding documents, you could use:
UpdateRequest req = new UpdateRequest();

@ -37,6 +37,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
@ -172,6 +173,36 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.canonical = null;
}
/**
 * Registers properties for an anchor url in this.anchors.
 * If the url has no entry yet, the given properties object is stored as-is.
 * Otherwise every entry of p that has a non-null, non-empty value is written
 * into the properties already stored for that url, replacing values with the same key.
 * @param url the anchor target used as key in this.anchors
 * @param p the properties collected for this occurrence of the url
 */
private void mergeAnchors(final MultiProtocolURI url, final Properties p) {
    final Properties known = this.anchors.get(url);
    if (known != null) {
        // the url was seen before: enrich the stored properties with the new non-empty values
        for (final Entry<Object, Object> attribute: p.entrySet()) {
            final Object value = attribute.getValue();
            if (value == null) continue;
            if (value.toString().length() == 0) continue;
            known.put(attribute.getKey(), value);
        }
        this.anchors.put(url, known);
        return;
    }
    // first occurrence of this url
    this.anchors.put(url, p);
}
/*
private void mergeAnchors(final MultiProtocolURI url, final String key, final String value) {
if (value == null) return;
if (value.length() == 0) return;
Properties p0 = this.anchors.get(url);
if (p0 == null) {
p0 = new Properties();
p0.put(key, value);
this.anchors.put(url, p0);
return;
}
// merge properties
p0.put(key, value);
this.anchors.put(url, p0);
}
*/
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p, pl, q, s = 0;
@ -258,7 +289,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = p + 6;
try {
url = new MultiProtocolURI(u);
this.anchors.put(url, new Properties());
mergeAnchors(url, new Properties());
continue;
} catch (final MalformedURLException e) {}
}
@ -306,7 +337,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
this.anchors.put(src, tagopts /* with property "name" */);
mergeAnchors(src, tagopts /* with property "name" */);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("body")) {
@ -333,8 +364,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String areatitle = cleanLine(tagopts.getProperty("title",""));
//String alt = tagopts.getProperty("alt","");
final String href = tagopts.getProperty("href", "");
final Properties p = new Properties(); p.put("name", areatitle);
if (href.length() > 0) this.anchors.put(absolutePath(href), p);
// the area title is the name of the link; readers of the anchor properties look up the key "name"
tagopts.put("name", areatitle);
if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
} else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", "");
final MultiProtocolURI newLink = absolutePath(href);
@ -349,8 +380,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.images.put(ie.url(), ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
final Properties p = new Properties(); p.put("name", this.title);
this.anchors.put(newLink, p);
tagopts.put("name", this.title);
mergeAnchors(newLink, tagopts);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
this.rss.put(newLink, linktitle);
@ -358,16 +389,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css.put(newLink, rel);
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
final Properties p = new Properties(); p.put("name", linktitle);
this.anchors.put(newLink, p);
tagopts.put("name", linktitle);
mergeAnchors(newLink, tagopts);
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
this.anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
mergeAnchors(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", "");
if (name.equalsIgnoreCase("movie")) {
this.anchors.put(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
mergeAnchors(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
}
}
@ -389,8 +420,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
addImage(this.images, ie);
} else {
tagopts.put("name", recursiveParse(text));
this.anchors.put(url, tagopts);
tagopts.put("text", recursiveParse(text));
mergeAnchors(url, tagopts);
}
}
this.evaluationScores.match(Element.apath, href);
@ -430,7 +461,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
this.anchors.put(src, tagopts /* with property "name" */);
mergeAnchors(src, tagopts /* with property "name" */);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("script")) {
@ -466,7 +497,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
Log.logException(e);
return cleanLine(super.stripAll(inlineHtml));
}
this.anchors.putAll(scraper.getAnchors());
for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
mergeAnchors(entry.getKey(), entry.getValue());
}
this.images.putAll(scraper.images);
return cleanLine(super.stripAll(scraper.content.getChars()));
@ -559,7 +592,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public boolean containsFlash() {
this.anchors = new HashMap<MultiProtocolURI, Properties>();
String ext;
for (final MultiProtocolURI url: this.anchors.keySet()) {
ext = url.getFileExtension();

@ -199,15 +199,51 @@ public final class TransformerWriter extends Writer {
return result;
}
/**
 * Joins two character arrays into one, separated by a single space.
 * If one of the arrays is null the other one is returned unchanged
 * (which is null again if both are null).
 * @param a the first part, may be null
 * @param b the second part, may be null
 * @return a, b, or a new array holding a + ' ' + b
 */
private static final char[] mergedScrape(final char[] a, final char[] b) {
    if (a != null && b != null) {
        final char[] joined = new char[a.length + 1 + b.length];
        System.arraycopy(a, 0, joined, 0, a.length);
        joined[a.length] = ' ';
        System.arraycopy(b, 0, joined, a.length + 1, b.length);
        return joined;
    }
    return (a == null) ? b : a;
}
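/**
* Every tag that appears is handed to this method. The method returns the text produced for the tag
* and also operates on the tag content to scrape information from it. If a previously opened tag is
* still unclosed when another tag opens, the output of the unclosed tag and the output of the new one
* are merged into one new char[].
* @param tag the tag name, or null if the content is plain text and not a tag opener/closer
* @param opening true if the tag is an opening tag
* @param content the characters that belong to the tag or text
* @param quotechar the quote character handed on when tags are transformed or generated
* @return the characters produced for this input
*/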
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
//System.out.println("FILTER1: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
char[] unclosed = null;
if (this.filterTag != null && opening) {
// there is a missing close tag for the currently parsed tag filterTag
// close that tag here and go on with new tag
if (this.scraper != null) {
this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
}
if (this.transformer != null) {
unclosed = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
} else {
unclosed = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
}
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
}
if (this.filterTag == null) {
// we are not collecting tag text
if (tag == null) {
// and this is not a tag opener/closer
if (this.scraper != null) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
if (this.transformer != null) return mergedScrape(unclosed, this.transformer.transformText(content));
return mergedScrape(unclosed, content);
}
// we have a new tag
@ -227,7 +263,7 @@ public final class TransformerWriter extends Writer {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(content);
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
return mergedScrape(unclosed, this.transformer.transformTag0(tag, scb.propParser(), quotechar));
} finally {
try {
scb.close();
@ -247,15 +283,15 @@ public final class TransformerWriter extends Writer {
Log.logException(e);
}
if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
return mergedScrape(unclosed, new char[0]);
} else {
// we ignore that thing and return it again
return genTag0raw(tag, true, content);
return mergedScrape(unclosed, genTag0raw(tag, true, content));
}
}
// we ignore that thing and return it again
return genTag0raw(tag, false, content);
return mergedScrape(unclosed, genTag0raw(tag, false, content));
}
@ -270,14 +306,14 @@ public final class TransformerWriter extends Writer {
this.filterCont.append(content);
}
} catch (final OutOfMemoryError e) {}
return new char[0];
return mergedScrape(unclosed, new char[0]);
}
// it's a tag! which one?
if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
if (opening || !(tag.equalsIgnoreCase(this.filterTag))) {
// this tag is not our concern. just add it
this.filterCont.append(genTag0raw(tag, opening, content));
return new char[0];
return mergedScrape(unclosed, new char[0]);
}
// it's our closing tag! return complete result.
@ -291,7 +327,7 @@ public final class TransformerWriter extends Writer {
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
return ret;
return mergedScrape(unclosed, ret);
}
private char[] filterFinalize(final char quotechar) {
@ -301,7 +337,9 @@ public final class TransformerWriter extends Writer {
// it's our closing tag! return complete result.
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.scraper != null) {
this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
}
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
} else {

Loading…
Cancel
Save