- solved problems with backpath normalization

- redesigned in/outbound link handover
- removed iframe links from inbound/outbound in solr scheme
pull/1/head
Michael Peter Christen 13 years ago
parent 5f5ed33ed8
commit 453010bd68

@ -106,7 +106,7 @@ public class webstructure {
prop.put("references_count", 1); prop.put("references_count", 1);
prop.put("references_documents", 1); prop.put("references_documents", 1);
prop.put("references_documents_0_hash", urlhash); prop.put("references_documents_0_hash", urlhash);
prop.put("references_documents_0_count", scraper.inboundLinkCount() + scraper.outboundLinkCount()); prop.put("references_documents_0_count", scraper.inboundLinks().size() + scraper.outboundLinks().size());
prop.put("references_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); prop.put("references_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date()));
prop.put("references_documents_0_urle", url == null ? 0 : 1); prop.put("references_documents_0_urle", url == null ? 0 : 1);
if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true, false)); if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true, false));

@ -84,31 +84,31 @@ public class SolrScheme extends ConfigurationSet {
*/ */
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
} }
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) { protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost); if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost);
} }
@ -308,92 +308,11 @@ public class SolrScheme extends ConfigurationSet {
if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths); if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths);
} }
// list all links // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors(); Set<MultiProtocolURI> inboundLinks = yacydoc.inboundLinks();
int c = 0; Set<MultiProtocolURI> ouboundLinks = yacydoc.outboundLinks();
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount());
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
c = 0; int c = 0;
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount());
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, Field.httpstatus_i, 200);
final Object parser = yacydoc.getParserObject(); final Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) { if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser; final ContentScraper html = (ContentScraper) parser;
@ -483,6 +402,8 @@ public class SolrScheme extends ConfigurationSet {
c = 0; c = 0;
for (final ImageEntry ie: imagesc) { for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url(); final MultiProtocolURI uri = ie.url();
inboundLinks.remove(uri);
ouboundLinks.remove(uri);
imgtags[c] = ie.toString(); imgtags[c] = ie.toString();
imgprots[c] = uri.getProtocol(); imgprots[c] = uri.getProtocol();
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3); imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
@ -503,6 +424,8 @@ public class SolrScheme extends ConfigurationSet {
c = 0; c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) { for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
final String url = entry.getKey().toNormalform(false, false, false, false); final String url = entry.getKey().toNormalform(false, false, false, false);
// Remove the stylesheet URL from the generic link sets so it is not reported twice.
// The sets hold MultiProtocolURI objects, so the removal must use the map key itself;
// passing the normal-form String (the local 'url') would never equal any element.
inboundLinks.remove(entry.getKey());
ouboundLinks.remove(entry.getKey());
css_tag[c] = css_tag[c] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" + "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ url + "\" />"; " href=\""+ url + "\" />";
@ -520,6 +443,8 @@ public class SolrScheme extends ConfigurationSet {
final String[] scripts = new String[scriptss.size()]; final String[] scripts = new String[scriptss.size()];
c = 0; c = 0;
for (final MultiProtocolURI url: scriptss) { for (final MultiProtocolURI url: scriptss) {
inboundLinks.remove(url);
ouboundLinks.remove(url);
scripts[c++] = url.toNormalform(false, false, false, false); scripts[c++] = url.toNormalform(false, false, false, false);
} }
addSolr(solrdoc, Field.scriptscount_i, scripts.length); addSolr(solrdoc, Field.scriptscount_i, scripts.length);
@ -531,21 +456,24 @@ public class SolrScheme extends ConfigurationSet {
final Set<MultiProtocolURI> framess = html.getFrames(); final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()]; final String[] frames = new String[framess.size()];
c = 0; c = 0;
for (final MultiProtocolURI entry: framess) { for (final MultiProtocolURI url: framess) {
frames[c++] = entry.toNormalform(false, false, false, false); inboundLinks.remove(url);
ouboundLinks.remove(url);
frames[c++] = url.toNormalform(false, false, false, false);
} }
addSolr(solrdoc, Field.framesscount_i, frames.length); addSolr(solrdoc, Field.framesscount_i, frames.length);
if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames); if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames);
} }
// IFrames // IFrames
if (isEmpty() || contains(Field.iframes_txt.name() if (isEmpty() || contains(Field.iframes_txt.name())) {
)) {
final Set<MultiProtocolURI> iframess = html.getIFrames(); final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()]; final String[] iframes = new String[iframess.size()];
c = 0; c = 0;
for (final MultiProtocolURI entry: iframess) { for (final MultiProtocolURI url: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false); inboundLinks.remove(url);
ouboundLinks.remove(url);
iframes[c++] = url.toNormalform(false, false, false, false);
} }
addSolr(solrdoc, Field.iframesscount_i, iframes.length); addSolr(solrdoc, Field.iframesscount_i, iframes.length);
if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes); if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes);
@ -568,6 +496,94 @@ public class SolrScheme extends ConfigurationSet {
// response time // response time
addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
} }
// list all links
// The link fields are written here, after the image/css/script/frame/iframe sections,
// because those sections remove their URLs from inboundLinks/ouboundLinks when they run;
// the sizes used below therefore reflect only the links that remain in the two sets.
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();

// ---- inbound links ----
c = 0;
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
// parallel arrays: index c addresses the same link in every array
final String[] inboundlinksTag = new String[inboundLinks.size()];
final String[] inboundlinksURLProtocol = new String[inboundLinks.size()];
final String[] inboundlinksURLStub = new String[inboundLinks.size()];
final String[] inboundlinksName = new String[inboundLinks.size()];
final String[] inboundlinksRel = new String[inboundLinks.size()];
final String[] inboundlinksText = new String[inboundLinks.size()];
for (final MultiProtocolURI url: inboundLinks) {
    // NOTE(review): assumes every URL in inboundLinks has an entry in the anchor map — confirm
    final Properties p = alllinks.get(url);
    final String name = p.getProperty("name", ""); // the name attribute
    final String rel = p.getProperty("rel", "");   // the rel-attribute
    final String text = p.getProperty("text", ""); // the text between the <a></a> tag
    final String urls = url.toNormalform(false, false);
    // split "<protocol>://<stub>" at the first "://"
    final int pr = urls.indexOf("://",0);
    inboundlinksURLProtocol[c] = urls.substring(0, pr);
    inboundlinksURLStub[c] = urls.substring(pr + 3);
    inboundlinksName[c] = name.length() > 0 ? name : "";
    inboundlinksRel[c] = rel.length() > 0 ? rel : "";
    inboundlinksText[c] = text.length() > 0 ? text : "";
    // rebuild an <a> tag that carries only the attributes that are actually set
    inboundlinksTag[c] =
        "<a href=\"" + url.toNormalform(false, false) + "\"" +
        (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
        (name.length() > 0 ? " name=\"" + name + "\"" : "") +
        ">" +
        ((text.length() > 0) ? text : "") + "</a>";
    c++;
}
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);

// ---- outbound links ----
c = 0;
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
// parallel arrays: index c addresses the same link in every array
final String[] outboundlinksTag = new String[ouboundLinks.size()];
final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()];
final String[] outboundlinksURLStub = new String[ouboundLinks.size()];
final String[] outboundlinksName = new String[ouboundLinks.size()];
final String[] outboundlinksRel = new String[ouboundLinks.size()];
final String[] outboundlinksText = new String[ouboundLinks.size()];
for (final MultiProtocolURI url: ouboundLinks) {
    // NOTE(review): assumes every URL in ouboundLinks has an entry in the anchor map — confirm
    final Properties p = alllinks.get(url);
    final String name = p.getProperty("name", ""); // the name attribute
    final String rel = p.getProperty("rel", "");   // the rel-attribute
    final String text = p.getProperty("text", ""); // the text between the <a></a> tag
    final String urls = url.toNormalform(false, false);
    // split "<protocol>://<stub>" at the first "://"
    final int pr = urls.indexOf("://",0);
    outboundlinksURLProtocol[c] = urls.substring(0, pr);
    outboundlinksURLStub[c] = urls.substring(pr + 3);
    outboundlinksName[c] = name.length() > 0 ? name : "";
    outboundlinksRel[c] = rel.length() > 0 ? rel : "";
    outboundlinksText[c] = text.length() > 0 ? text : "";
    // rebuild an <a> tag that carries only the attributes that are actually set
    outboundlinksTag[c] =
        "<a href=\"" + url.toNormalform(false, false) + "\"" +
        (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
        (name.length() > 0 ? " name=\"" + name + "\"" : "") +
        ">" +
        ((text.length() > 0) ? text : "") + "</a>";
    c++;
}
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
// the rel flags must be derived from the OUTBOUND rel values so they line up index-by-index
// with the other outboundlinks_* arrays (previously the inbound array was passed by mistake)
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(outboundlinksRel));
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);

// charset
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());

// coordinates
// written only when both values are non-zero
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
    addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
    addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
}
// the status is written as the constant 200 here
addSolr(solrdoc, Field.httpstatus_i, 200);
return solrdoc; return solrdoc;
} }

@ -626,16 +626,6 @@ dc_rights
this.favicon = faviconURL; this.favicon = faviconURL;
} }
public int inboundLinkCount() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? 0 : this.inboundlinks.size();
}
public int outboundLinkCount() {
if (this.outboundlinks == null) resortLinks();
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
}
public int inboundLinkNofollowCount() { public int inboundLinkNofollowCount() {
if (this.inboundlinks == null) resortLinks(); if (this.inboundlinks == null) resortLinks();
if (this.inboundlinks == null) return 0; if (this.inboundlinks == null) return 0;

@ -65,8 +65,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final char[] minuteCharsHTML = "&#039;".toCharArray(); private final char[] minuteCharsHTML = "&#039;".toCharArray();
// statics: for initialization of the HTMLFilterAbstractScraper // statics: for initialization of the HTMLFilterAbstractScraper
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f); private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f); private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
public enum TagType { public enum TagType {
singleton, pair; singleton, pair;
@ -119,6 +119,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<MultiProtocolURI, Properties> anchors; private final Map<MultiProtocolURI, Properties> anchors;
private final Map<MultiProtocolURI, String> rss, css; private final Map<MultiProtocolURI, String> rss, css;
private final Set<MultiProtocolURI> script, frames, iframes; private final Set<MultiProtocolURI> script, frames, iframes;
private final Map<MultiProtocolURI, EmbedEntry> embeds; // urlhash/embed relation
private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas; private final Map<String, String> metas;
private String title; private String title;
@ -159,6 +160,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css = new HashMap<MultiProtocolURI, String>(); this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>(); this.anchors = new HashMap<MultiProtocolURI, Properties>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>(); this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.embeds = new HashMap<MultiProtocolURI, EmbedEntry>();
this.frames = new HashSet<MultiProtocolURI>(); this.frames = new HashSet<MultiProtocolURI>();
this.iframes = new HashSet<MultiProtocolURI>(); this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>(); this.metas = new HashMap<String, String>();
@ -317,11 +319,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (tagname.equalsIgnoreCase("img")) { if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", EMPTY_STRING); final String src = tagopts.getProperty("src", EMPTY_STRING);
try { try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
if (src.length() > 0) { if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src); final MultiProtocolURI url = absolutePath(src);
if (url != null) { if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1); final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
addImage(this.images, ie); addImage(this.images, ie);
} }
@ -334,6 +336,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final MalformedURLException e) {} } catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) { } else if (tagname.equalsIgnoreCase("frame")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true, false));
mergeAnchors(src, tagopts /* with property "name" */); mergeAnchors(src, tagopts /* with property "name" */);
this.frames.add(src); this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false)); this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
@ -361,13 +364,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING)); final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
//String alt = tagopts.getProperty("alt",EMPTY_STRING); //String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING); final String href = tagopts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
tagopts.put("nme", areatitle); tagopts.put("nme", areatitle);
if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts); MultiProtocolURI url = absolutePath(href);
tagopts.put("href", url.toNormalform(true, false));
mergeAnchors(url, tagopts);
}
} else if (tagname.equalsIgnoreCase("link")) { } else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", EMPTY_STRING); final String href = tagopts.getProperty("href", EMPTY_STRING);
final MultiProtocolURI newLink = absolutePath(href); final MultiProtocolURI newLink = absolutePath(href);
if (newLink != null) { if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true, false));
final String rel = tagopts.getProperty("rel", EMPTY_STRING); final String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING); final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING); final String type = tagopts.getProperty("type", EMPTY_STRING);
@ -391,11 +399,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
} else if(tagname.equalsIgnoreCase("embed")) { } else if(tagname.equalsIgnoreCase("embed")) {
mergeAnchors(absolutePath(tagopts.getProperty("src", EMPTY_STRING)), tagopts /* with property "name" */); final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
tagopts.put("src", url.toNormalform(true, false));
final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
mergeAnchors(url, tagopts);
}
}
} catch (final NumberFormatException e) {}
} else if(tagname.equalsIgnoreCase("param")) { } else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", EMPTY_STRING); final String name = tagopts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) { if (name.equalsIgnoreCase("movie")) {
mergeAnchors(absolutePath(tagopts.getProperty("value", EMPTY_STRING)), tagopts /* with property "name" */); MultiProtocolURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
tagopts.put("value", url.toNormalform(true, false));
mergeAnchors(url, tagopts /* with property "name" */);
} }
} }
@ -419,6 +442,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
addImage(this.images, ie); addImage(this.images, ie);
} else { } else {
tagopts.put("text", recursiveParse(text)); tagopts.put("text", recursiveParse(text));
tagopts.put("href", url.toNormalform(true, false)); // we must assign this because the url may have resolved backpaths and may not be absolute
mergeAnchors(url, tagopts); mergeAnchors(url, tagopts);
} }
} }
@ -460,6 +484,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (h.length() > 0) this.li.add(h); if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) { } else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true, false));
mergeAnchors(src, tagopts /* with property "name" */); mergeAnchors(src, tagopts /* with property "name" */);
this.iframes.add(src); this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false)); this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
@ -654,10 +679,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return a map of <urlhash, ImageEntry> * @return a map of <urlhash, ImageEntry>
*/ */
public Map<MultiProtocolURI, ImageEntry> getImages() { public Map<MultiProtocolURI, ImageEntry> getImages() {
// this resturns a String(absolute url)/htmlFilterImageEntry - relation
return this.images; return this.images;
} }
public Map<MultiProtocolURI, EmbedEntry> getEmbeds() {
return this.embeds;
}
public Map<String, String> getMetas() { public Map<String, String> getMetas() {
return this.metas; return this.metas;
} }

@ -388,8 +388,8 @@ public class Segment {
Response.docType(document.dc_format()), // doctype Response.docType(document.dc_format()), // doctype
condenser.RESULT_FLAGS, // flags condenser.RESULT_FLAGS, // flags
UTF8.getBytes(language), // language UTF8.getBytes(language), // language
document.inboundLinkCount(), // inbound links document.inboundLinks().size(), // inbound links
document.outboundLinkCount(), // outbound links document.outboundLinks().size(), // outbound links
document.getAudiolinks().size(), // laudio document.getAudiolinks().size(), // laudio
document.getImages().size(), // limage document.getImages().size(), // limage
document.getVideolinks().size(), // lvideo document.getVideolinks().size(), // lvideo
@ -409,8 +409,8 @@ public class Segment {
condenser, // document condenser condenser, // document condenser
language, // document language language, // document language
Response.docType(document.dc_format()), // document type Response.docType(document.dc_format()), // document type
document.inboundLinkCount(), // inbound links document.inboundLinks().size(), // inbound links
document.outboundLinkCount(), // outbound links document.outboundLinks().size(), // outbound links
searchEvent, // a search event that can have results directly searchEvent, // a search event that can have results directly
sourceName // the name of the source where the index was created sourceName // the name of the source where the index was created
); );

Loading…
Cancel
Save