From 61c5e4068743dc0c8f8c9fa6e5e38be31b93e90a Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 15 Sep 2013 23:27:04 +0200 Subject: [PATCH] - replaced the properties object in AnchorURL with distinct variables for anchor attributes. - as a result, large portions of the parser code had to be adapted as well - added a counter target_order_i for anchor links in webgraph computation --- defaults/solr.webgraph.schema | 3 + htroot/ViewFile.java | 16 ++-- .../net/yacy/cora/document/id/AnchorURL.java | 78 +++++++++++++++++-- .../net/yacy/cora/document/id/DigestURL.java | 11 --- source/net/yacy/crawler/CrawlStacker.java | 4 +- .../net/yacy/crawler/data/ResultImages.java | 3 +- .../net/yacy/crawler/retrieval/Response.java | 3 +- source/net/yacy/data/BookmarkHelper.java | 4 +- source/net/yacy/document/Condenser.java | 6 +- source/net/yacy/document/Document.java | 62 +++++++-------- source/net/yacy/document/Parser.java | 4 +- source/net/yacy/document/TextParser.java | 12 +-- .../document/importer/MediawikiImporter.java | 5 +- .../yacy/document/parser/audioTagParser.java | 4 +- .../parser/augment/AugmentParser.java | 3 +- .../net/yacy/document/parser/bzipParser.java | 4 +- .../net/yacy/document/parser/csvParser.java | 4 +- .../net/yacy/document/parser/docParser.java | 4 +- .../net/yacy/document/parser/dwgParser.java | 4 +- .../yacy/document/parser/genericParser.java | 4 +- .../net/yacy/document/parser/gzipParser.java | 4 +- .../document/parser/html/ContentScraper.java | 25 +++--- .../yacy/document/parser/html/ImageEntry.java | 14 ++-- .../net/yacy/document/parser/htmlParser.java | 9 ++- .../parser/images/genericImageParser.java | 17 ++-- source/net/yacy/document/parser/mmParser.java | 4 +- .../net/yacy/document/parser/odtParser.java | 3 +- .../net/yacy/document/parser/ooxmlParser.java | 6 +- .../net/yacy/document/parser/pdfParser.java | 4 +- .../net/yacy/document/parser/pptParser.java | 4 +- source/net/yacy/document/parser/psParser.java | 3 +- 
.../net/yacy/document/parser/rdfParser.java | 4 +- .../document/parser/rdfa/impl/RDFaParser.java | 7 +- .../net/yacy/document/parser/rssParser.java | 7 +- .../net/yacy/document/parser/rtfParser.java | 4 +- .../yacy/document/parser/sevenzipParser.java | 10 +-- .../yacy/document/parser/sidAudioParser.java | 4 +- .../yacy/document/parser/sitemapParser.java | 5 +- .../net/yacy/document/parser/swfParser.java | 5 +- .../net/yacy/document/parser/tarParser.java | 6 +- .../yacy/document/parser/torrentParser.java | 6 +- .../net/yacy/document/parser/vcfParser.java | 5 +- .../net/yacy/document/parser/vsdParser.java | 4 +- .../net/yacy/document/parser/xlsParser.java | 3 +- .../net/yacy/document/parser/zipParser.java | 5 +- .../peers/graphics/WebStructureGraph.java | 5 +- .../net/yacy/repository/LoaderDispatcher.java | 3 +- source/net/yacy/search/Switchboard.java | 11 +-- .../net/yacy/search/index/DocumentIndex.java | 19 ++--- .../search/schema/WebgraphConfiguration.java | 11 ++- .../yacy/search/schema/WebgraphSchema.java | 1 + .../net/yacy/search/snippet/MediaSnippet.java | 9 ++- 52 files changed, 265 insertions(+), 200 deletions(-) diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index d2c505d17..25afe61b3 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -129,6 +129,9 @@ target_name_t ## primary key of document, the URL hash (target) target_id_s +## order number of target url, a count from first to last URL on the source page (target) +target_order_i + ## the protocol of the url (target) target_protocol_s diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ef4137111..01027ad47 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -34,10 +34,9 @@ import java.util.Collection; import java.util.Enumeration; import java.util.Iterator; import java.util.Map; -import java.util.Properties; - import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; +import 
net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -315,7 +314,7 @@ public class ViewFile { i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); dark = (i % 2 == 0); - final Map ts = document.getImages(); + final Map ts = document.getImages(); final Iterator tsi = ts.values().iterator(); ImageEntry entry; while (tsi.hasNext()) { @@ -439,15 +438,14 @@ public class ViewFile { final serverObjects prop, final String[] wordArray, int c, - final Map media, + final Map media, final String type, boolean dark) { int i = 0; - for (final Map.Entry entry : media.entrySet()) { - final Properties p = entry.getKey().getProperties(); - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag + for (final Map.Entry entry : media.entrySet()) { + final String name = entry.getKey().getNameProperty(); // the name attribute + final String rel = entry.getKey().getRelProperty(); // the rel-attribute + final String text = entry.getKey().getTextProperty(); // the text between the tag prop.put("viewMode_links_" + c + "_nr", c); prop.put("viewMode_links_" + c + "_dark", ((dark) ? 
1 : 0)); diff --git a/source/net/yacy/cora/document/id/AnchorURL.java b/source/net/yacy/cora/document/id/AnchorURL.java index d9c01065f..21fb4dd3d 100644 --- a/source/net/yacy/cora/document/id/AnchorURL.java +++ b/source/net/yacy/cora/document/id/AnchorURL.java @@ -28,25 +28,38 @@ public class AnchorURL extends DigestURL { private static final long serialVersionUID = 1586579902179962086L; - private Properties properties; // may contain additional url properties, such as given in html a href-links + private String nameProperty, textProperty, relProperty, hrefProperty; // may contain additional url properties, such as given in html a href-links public AnchorURL(final String url) throws MalformedURLException { super(url); - this.properties = new Properties(); + this.nameProperty = ""; + this.textProperty = ""; + this.relProperty = ""; + this.hrefProperty = ""; + } + + public AnchorURL(final DigestURL url) { + super(url, url.hash()); + this.nameProperty = ""; + this.textProperty = ""; + this.relProperty = ""; + this.hrefProperty = ""; } public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException { super(baseURL, relPath); - this.properties = new Properties(); + this.nameProperty = ""; + this.textProperty = ""; + this.relProperty = ""; + this.hrefProperty = ""; } public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException { super(protocol, host, port, path); - this.properties = new Properties(); - } - - public Properties getProperties() { - return this.properties; + this.nameProperty = ""; + this.textProperty = ""; + this.relProperty = ""; + this.hrefProperty = ""; } public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException { @@ -65,4 +78,53 @@ public class AnchorURL extends DigestURL { } return new AnchorURL(baseURL, relPath); } + + public String getNameProperty() { + return nameProperty; + } + + public void 
setNameProperty(String name) { + this.nameProperty = name; + } + + public String getTextProperty() { + return textProperty; + } + + public void setTextProperty(String text) { + this.textProperty = text; + } + + public String getRelProperty() { + return relProperty; + } + + public void setRelProperty(String rel) { + this.relProperty = rel; + } + + public String getHrefProperty() { + return hrefProperty; + } + + public void setHrefProperty(String href) { + this.hrefProperty = href; + } + + public void setAll(final Properties tagopts) { + this.nameProperty = tagopts.getProperty("name", ""); + this.textProperty = tagopts.getProperty("text", ""); + this.relProperty = tagopts.getProperty("rel", ""); + this.hrefProperty = tagopts.getProperty("href", ""); + } + + public Properties getAll() { + final Properties tagopts = new Properties(); + tagopts.setProperty("name", this.nameProperty); + tagopts.setProperty("text", this.textProperty); + tagopts.setProperty("rel", this.relProperty); + tagopts.setProperty("href", this.hrefProperty); + return tagopts; + } + } diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java index 48e9cee6c..c1bdb78ed 100644 --- a/source/net/yacy/cora/document/id/DigestURL.java +++ b/source/net/yacy/cora/document/id/DigestURL.java @@ -27,7 +27,6 @@ import java.io.File; import java.io.Serializable; import java.net.MalformedURLException; import java.util.HashSet; -import java.util.Properties; import java.util.Set; import java.util.regex.Pattern; @@ -53,7 +52,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable { // class variables private byte[] hash; - private Properties properties; // may contain additional url properties, such as given in html a href-links /** * Shortcut, calculate hash for shorted url/hostname @@ -117,7 +115,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable { public DigestURL(final String url) throws MalformedURLException { 
super(url); this.hash = null; - this.properties = new Properties(); } /** @@ -129,7 +126,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable { public DigestURL(final String url, final byte[] hash) throws MalformedURLException { super(url); this.hash = hash; - this.properties = new Properties(); } /** @@ -140,19 +136,16 @@ public class DigestURL extends MultiProtocolURL implements Serializable { public DigestURL(final MultiProtocolURL baseURL, final byte[] hash) { super(baseURL); this.hash = hash; - this.properties = new Properties(); } public DigestURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException { super(baseURL, relPath); this.hash = null; - this.properties = new Properties(); } public DigestURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException { super(protocol, host, port, path); this.hash = null; - this.properties = new Properties(); } public static DigestURL newURL(final DigestURL baseURL, String relPath) throws MalformedURLException { @@ -173,10 +166,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable { } private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful - - public Properties getProperties() { - return this.properties; - } @Override public int hashCode() { diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index d21b487dd..953619fda 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -177,7 +177,7 @@ public final class CrawlStacker { } private void enqueueEntries(final byte[] initiator, final String profileHandle, final List hyperlinks, final boolean replace) { - for (final DigestURL url: hyperlinks) { + for (final AnchorURL url: hyperlinks) { if (url == null) continue; // delete old entry, if exists to force a re-load of the url (thats wanted here) @@ 
-211,7 +211,7 @@ public final class CrawlStacker { initiator, url, null, - url.getProperties().getProperty("name", ""), + url.getNameProperty(), new Date(), profileHandle, 0, diff --git a/source/net/yacy/crawler/data/ResultImages.java b/source/net/yacy/crawler/data/ResultImages.java index adeecd40c..f8d590ecf 100644 --- a/source/net/yacy/crawler/data/ResultImages.java +++ b/source/net/yacy/crawler/data/ResultImages.java @@ -31,6 +31,7 @@ import java.util.Queue; import java.util.Set; import java.util.concurrent.LinkedBlockingQueue; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.storage.SizeLimitedSet; @@ -61,7 +62,7 @@ public class ResultImages { if (MemoryControl.shortStatus()) clearQueues(); limitQueues(1000); - final Map images = document.getImages(); + final Map images = document.getImages(); for (final ImageEntry image: images.values()) { // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup if (image == null || image.url() == null) continue; diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 717a65324..17b9d8c16 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -32,6 +32,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.HeaderFramework; @@ -828,7 +829,7 @@ public class Response { final String supportError = TextParser.supports(url(), this.responseHeader == null ? 
null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content); + return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content); } catch (final Exception e) { return null; } diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index 2d9dfd91f..abc7e2e51 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -145,8 +145,8 @@ public class BookmarkHelper { writer.close(); links = scraper.getAnchors(); } catch (final IOException e) { ConcurrentLog.warn("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());} - for (final DigestURL url: links) { - title = url.getProperties().getProperty("name", ""); + for (final AnchorURL url: links) { + title = url.getNameProperty(); ConcurrentLog.info("BOOKMARKS", "links.get(url)"); if ("".equals(title)) {//cannot be displayed title = url.toString(); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index bceda9b8c..f4100b304 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -43,7 +43,7 @@ import net.yacy.cora.document.WordCache; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.encoding.ASCII; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import 
net.yacy.cora.language.synonyms.SynonymLibrary; @@ -113,7 +113,7 @@ public final class Condenser { // add the URL components to the word list insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); - Map.Entry entry; + Map.Entry entry; if (indexText) { createCondensement(document.getTextString(), meaningLib, doAutotagging); // the phrase counter: @@ -165,7 +165,7 @@ public final class Condenser { if (indexMedia) { // add anchor descriptions: here, we also add the url components // audio - Iterator> i = document.getAudiolinks().entrySet().iterator(); + Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index e8e06e51b..af82bbb56 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -77,11 +77,11 @@ public class Document { private Object text; // the clear text, all that is visible private final Collection anchors; // all links embedded as clickeable entities (anchor tags) private final LinkedHashMap rss; // all embedded rss feeds - private final LinkedHashMap images; // all visible pictures in document + private final LinkedHashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. 
- private LinkedHashMap audiolinks, videolinks, applinks, hyperlinks; + private LinkedHashMap audiolinks, videolinks, applinks, hyperlinks; private LinkedHashMap inboundlinks, outboundlinks; private Map emaillinks; private MultiProtocolURL favicon; @@ -104,7 +104,7 @@ public class Document { final Object text, final Collection anchors, final LinkedHashMap rss, - final LinkedHashMap images, + final LinkedHashMap images, final boolean indexingDenied, final Date date) { this.source = location; @@ -128,7 +128,7 @@ public class Document { } this.anchors = (anchors == null) ? new ArrayList(0) : anchors; this.rss = (rss == null) ? new LinkedHashMap(0) : rss; - this.images = (images == null) ? new LinkedHashMap() : images; + this.images = (images == null) ? new LinkedHashMap() : images; this.publisher = publisher; this.hyperlinks = null; this.audiolinks = null; @@ -417,30 +417,30 @@ dc_rights // the next three methods provide a calculated view on the getAnchors/getImages: - public Map getHyperlinks() { + public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!this.resorted) resortLinks(); return this.hyperlinks; } - public Map getAudiolinks() { + public Map getAudiolinks() { if (!this.resorted) resortLinks(); return this.audiolinks; } - public Map getVideolinks() { + public Map getVideolinks() { if (!this.resorted) resortLinks(); return this.videolinks; } - public Map getImages() { + public Map getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!this.resorted) resortLinks(); return this.images; } - public Map getApplinks() { + public Map getApplinks() { if (!this.resorted) resortLinks(); return this.applinks; } @@ -474,19 +474,19 @@ dc_rights final String thishost = this.source.getHost(); this.inboundlinks = new LinkedHashMap(); this.outboundlinks = new LinkedHashMap(); - this.hyperlinks = new LinkedHashMap(); - this.videolinks = new 
LinkedHashMap(); - this.audiolinks = new LinkedHashMap(); - this.applinks = new LinkedHashMap(); + this.hyperlinks = new LinkedHashMap(); + this.videolinks = new LinkedHashMap(); + this.audiolinks = new LinkedHashMap(); + this.applinks = new LinkedHashMap(); this.emaillinks = new LinkedHashMap(); - final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - for (final Map.Entry entry: collectedImages.entrySet()) { + final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + for (final Map.Entry entry: collectedImages.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } - for (final DigestURL url: this.anchors) { + for (final AnchorURL url: this.anchors) { if (url == null) continue; - final boolean noindex = url.getProperties().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0; - final boolean nofollow = url.getProperties().getProperty("rel", "").toLowerCase().indexOf("nofollow",0) >= 0; + final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0; + final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0; if ((thishost == null && url.getHost() == null) || ((thishost != null && url.getHost() != null) && (url.getHost().endsWith(thishost) || @@ -496,7 +496,7 @@ dc_rights this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? 
" nofollow" : "")); } u = url.toNormalform(true); - final String name = url.getProperties().getProperty("name", ""); + final String name = url.getNameProperty(); if (u.startsWith("mailto:")) { this.emaillinks.put(u.substring(7), name); } else { @@ -592,23 +592,23 @@ dc_rights return v; } - private static Map allReflinks(final Collection links) { + private static Map allReflinks(final Collection links) { // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url - final Map v = new HashMap(); + final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; - DigestURL url = null; + AnchorURL url = null; String u; int pos; loop: while (i.hasNext()) try { o = i.next(); - if (o instanceof DigestURL) - url = (DigestURL) o; + if (o instanceof AnchorURL) + url = (AnchorURL) o; else if (o instanceof String) - url = new DigestURL((String) o); + url = new AnchorURL((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { @@ -622,7 +622,7 @@ dc_rights u = u.substring(pos); while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos); - url = new DigestURL(u); + url = new AnchorURL(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -632,7 +632,7 @@ dc_rights u = "http:/" + u.substring(pos); while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) u = "http:/" + u.substring(pos); - url = new DigestURL(u); + url = new AnchorURL(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -783,7 +783,7 @@ dc_rights final Collection sectionTitles = new LinkedHashSet(); final List anchors = new ArrayList(); final LinkedHashMap rss = new LinkedHashMap(); - final LinkedHashMap images = new LinkedHashMap(); + final LinkedHashMap images = new LinkedHashMap(); double lon = 0.0d, lat = 0.0d; Date date = new Date(); @@ -890,7 +890,7 @@ dc_rights public static Map getAudiolinks(final Document[] documents) { final Map result = new 
HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.audiolinks.entrySet()) { + for (Map.Entry e: d.audiolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } @@ -900,7 +900,7 @@ dc_rights public static Map getVideolinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.videolinks.entrySet()) { + for (Map.Entry e: d.videolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } @@ -910,7 +910,7 @@ dc_rights public static Map getApplinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.applinks.entrySet()) { + for (Map.Entry e: d.applinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index f930dcfd5..272666e8f 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -26,7 +26,7 @@ package net.yacy.document; import java.io.InputStream; import java.util.Set; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; public interface Parser { @@ -54,7 +54,7 @@ public interface Parser { * @throws InterruptedException */ public Document[] parse( - DigestURL url, + AnchorURL url, String mimeType, String charset, InputStream source diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 52ebf8f7b..8e8a153f2 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -34,7 +34,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import 
net.yacy.document.parser.audioTagParser; import net.yacy.document.parser.bzipParser; @@ -156,7 +156,7 @@ public final class TextParser { } public static Document[] parseSource( - final DigestURL location, + final AnchorURL location, final String mimeType, final String charset, final File sourceFile @@ -186,7 +186,7 @@ public final class TextParser { } public static Document[] parseSource( - final DigestURL location, + final AnchorURL location, String mimeType, final String charset, final byte[] content @@ -209,7 +209,7 @@ public final class TextParser { } public static Document[] parseSource( - final DigestURL location, + final AnchorURL location, String mimeType, final String charset, final long contentLength, @@ -248,7 +248,7 @@ public final class TextParser { } private static Document[] parseSource( - final DigestURL location, + final AnchorURL location, final String mimeType, final Parser parser, final String charset, @@ -269,7 +269,7 @@ public final class TextParser { } private static Document[] parseSource( - final DigestURL location, + final AnchorURL location, final String mimeType, final Set parsers, final String charset, diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index b6fa2f445..56c4eea5a 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -51,6 +51,7 @@ import java.util.concurrent.TimeoutException; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; @@ -501,7 +502,7 @@ public class MediawikiImporter extends Thread implements Importer { public class wikiparserrecord { public String title; String source, html, hostport, urlStub; - DigestURL url; + AnchorURL url; Document document; public 
wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) { this.title = title; @@ -520,7 +521,7 @@ public class MediawikiImporter extends Thread implements Importer { } public void genDocument() throws Parser.Failure { try { - this.url = new DigestURL(this.urlStub + this.title); + this.url = new AnchorURL(this.urlStub + this.title); final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html)); this.document = Document.mergeDocuments(this.url, "text/html", parsed); // the wiki parser is not able to find the proper title in the source text, so it must be set here diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index e405996d0..35d5c8c93 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -35,7 +35,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -69,7 +69,7 @@ public class audioTagParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 803169617..bede9830e 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -6,6 +6,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import 
net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.ymark.YMarkUtil; @@ -35,7 +36,7 @@ public class AugmentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(AnchorURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 4e2f0ab7b..6e088fafd 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -31,7 +31,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -56,7 +56,7 @@ public class bzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index b690485d9..7af75791c 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -33,7 +33,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Parser; @@ -51,7 +51,7 @@ public class csvParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 297adf3f2..301f3ce1a 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -30,7 +30,7 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.Date; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -54,7 +54,7 @@ public class docParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/dwgParser.java b/source/net/yacy/document/parser/dwgParser.java index 7dff5137e..e5980df64 100644 --- a/source/net/yacy/document/parser/dwgParser.java +++ b/source/net/yacy/document/parser/dwgParser.java @@ -25,7 +25,7 @@ package net.yacy.document.parser; import java.io.InputStream; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Parser; @@ -60,7 +60,7 @@ public class dwgParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, true)) diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index e285f0624..748767462 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -27,7 +27,7 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.Date; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -45,7 +45,7 @@ public class genericParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source1) throws Parser.Failure, InterruptedException { String filename = location.getFileName(); diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 72d9c952d..d62022085 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -32,7 +32,7 @@ import java.io.FileOutputStream; import java.io.InputStream; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.id.DigestURL; +import 
net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -55,7 +55,7 @@ public class gzipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; Document[] docs = null; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 7b36537fa..702fbd344 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -325,7 +325,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final DigestURL url = absolutePath(src); + final AnchorURL url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); @@ -342,7 +342,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if (tagname.equalsIgnoreCase("frame")) { final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); - src.getProperties().putAll(tagopts); + src.setAll(tagopts); this.anchors.add(src); this.frames.add(src); this.evaluationScores.match(Element.framepath, src.toNormalform(true)); @@ -378,10 +378,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { //String alt = tagopts.getProperty("alt",EMPTY_STRING); final String href = tagopts.getProperty("href", 
EMPTY_STRING); if (href.length() > 0) { - tagopts.put("nme", areatitle); + tagopts.put("name", areatitle); AnchorURL url = absolutePath(href); tagopts.put("href", url.toNormalform(true)); - url.getProperties().putAll(tagopts); + url.setAll(tagopts); this.anchors.add(url); } } else if (tagname.equalsIgnoreCase("link")) { @@ -401,7 +401,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.favicon = newLink; } else if (rel.equalsIgnoreCase("canonical")) { tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next()); - newLink.getProperties().putAll(tagopts); + newLink.setAll(tagopts); this.anchors.add(newLink); this.canonical = newLink; } else if (rel.equalsIgnoreCase("publisher")) { @@ -417,7 +417,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.csspath, href); } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) { tagopts.put("name", linktitle); - newLink.getProperties().putAll(tagopts); + newLink.setAll(tagopts); this.anchors.add(newLink); } } @@ -432,7 +432,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { tagopts.put("src", url.toNormalform(true)); final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING)); this.embeds.put(url, ie); - url.getProperties().putAll(tagopts); + url.setAll(tagopts); this.anchors.add(url); } } @@ -442,13 +442,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (name.equalsIgnoreCase("movie")) { AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); tagopts.put("value", url.toNormalform(true)); - url.getProperties().putAll(tagopts); + url.setAll(tagopts); this.anchors.add(url); } } else if (tagname.equalsIgnoreCase("iframe")) { final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); 
tagopts.put("src", src.toNormalform(true)); - src.getProperties().putAll(tagopts); + src.setAll(tagopts); this.anchors.add(src); this.iframes.add(src); this.evaluationScores.match(Element.iframepath, src.toNormalform(true)); @@ -475,9 +475,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1); this.images.add(ie); } else { - tagopts.put("text", recursiveParse(url, text)); + tagopts.put("text", new String(text)); tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute - url.getProperties().putAll(tagopts); + url.setAll(tagopts); + recursiveParse(url, text); this.anchors.add(url); } } @@ -541,7 +542,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" ")); } - private String recursiveParse(final DigestURL linkurl, final char[] inlineHtml) { + private String recursiveParse(final AnchorURL linkurl, final char[] inlineHtml) { if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); // start a new scraper to parse links inside this text diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java index f1d160f05..60003f23c 100644 --- a/source/net/yacy/document/parser/html/ImageEntry.java +++ b/source/net/yacy/document/parser/html/ImageEntry.java @@ -26,12 +26,12 @@ package net.yacy.document.parser.html; import java.util.Comparator; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; public class ImageEntry implements Comparable, Comparator { - private final DigestURL imageurl; - private DigestURL linkurl; + private final AnchorURL imageurl; + private AnchorURL linkurl; private final String alt; private String anchortext; private final 
int width, height; @@ -49,7 +49,7 @@ public class ImageEntry implements Comparable, Comparator, Comparator noDoubleImages = new LinkedHashMap(); + LinkedHashMap noDoubleImages = new LinkedHashMap(); for (ImageEntry ie: scraper.getImages()) noDoubleImages.put(ie.url(), ie); final Document ppd = new Document( location, @@ -301,9 +302,9 @@ public class htmlParser extends AbstractParser implements Parser { public static void main(final String[] args) { // test parsing of a url - DigestURL url; + AnchorURL url; try { - url = new DigestURL(args[0]); + url = new AnchorURL(args[0]); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent); final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); final String title = document[0].dc_title(); diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 7deeb71b7..420c64417 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -46,7 +46,6 @@ import java.util.Set; import javax.imageio.ImageIO; import net.yacy.cora.document.id.AnchorURL; -import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -95,7 +94,7 @@ public class genericImageParser extends AbstractParser implements Parser { @Override public Document[] parse( - final DigestURL location, + final AnchorURL location, final String mimeType, final String documentCharset, final InputStream sourceStream) throws Parser.Failure, InterruptedException { @@ -199,7 +198,7 @@ public class genericImageParser extends AbstractParser implements Parser { final HashSet languages = new HashSet(); final List anchors = new ArrayList(); - final LinkedHashMap images = new LinkedHashMap(); + final LinkedHashMap images = new 
LinkedHashMap(); // add this image to the map of images final String infoString = ii.info.toString(); images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); @@ -238,7 +237,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final DigestURL location, + final AnchorURL location, final InputStream sourceStream) throws Parser.Failure { BufferedImage image = null; try { @@ -253,7 +252,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final DigestURL location, + final AnchorURL location, final BufferedImage image) { final ImageInfo ii = new ImageInfo(location); ii.image = image; @@ -290,12 +289,12 @@ public class genericImageParser extends AbstractParser implements Parser { } public static class ImageInfo { - public DigestURL location; + public AnchorURL location; public BufferedImage image; public StringBuilder info; public int height; public int width; - public ImageInfo(final DigestURL location) { + public ImageInfo(final AnchorURL location) { this.location = location; this.image = null; this.info = new StringBuilder(); @@ -309,9 +308,9 @@ public class genericImageParser extends AbstractParser implements Parser { public static void main(final String[] args) { final File image = new File(args[0]); final genericImageParser parser = new genericImageParser(); - DigestURL uri; + AnchorURL uri; try { - uri = new DigestURL("http://localhost/" + image.getName()); + uri = new AnchorURL("http://localhost/" + image.getName()); final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image)); System.out.println(document[0].toString()); } catch (final MalformedURLException e) { diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java index a0bcc6ceb..d326bd02e 100644 --- 
a/source/net/yacy/document/parser/mmParser.java +++ b/source/net/yacy/document/parser/mmParser.java @@ -35,7 +35,7 @@ import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -70,7 +70,7 @@ public class mmParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index c30f03c62..5e344f111 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -43,6 +43,7 @@ import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -214,7 +215,7 @@ public class odtParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index d4a52ae24..6371ea5c0 100644 --- 
a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -43,7 +43,7 @@ import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -91,7 +91,7 @@ public class ooxmlParser extends AbstractParser implements Parser { return parser; } - private Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException { + private Document[] parse(final AnchorURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException { CharBuffer writer = null; try { @@ -201,7 +201,7 @@ public class ooxmlParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index b759798ec..72181ca7a 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -55,7 +55,7 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.PDType3Font; import org.apache.pdfbox.util.PDFTextStripper; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import 
net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -84,7 +84,7 @@ public class pdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 4f0128b6a..f21f188bf 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -31,7 +31,7 @@ import java.io.BufferedInputStream; import java.io.InputStream; import java.util.Date; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -60,7 +60,7 @@ public class pptParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index 707b7a3ed..0d3f5ea53 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -36,6 +36,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.util.Date; +import net.yacy.cora.document.id.AnchorURL; 
import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -256,7 +257,7 @@ public class psParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index 5079f0475..14c8b26c3 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -30,7 +30,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -45,7 +45,7 @@ public class rdfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL url, final String mimeType, + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) throws Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index cf8accb70..ceeaff2f7 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -17,6 +17,7 @@ import java.util.Date; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -46,7 +47,7 @@ public class RDFaParser extends AbstractParser implements Parser { } @Override - public Document[] 
parse(DigestURL url, String mimeType, + public Document[] parse(AnchorURL url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { @@ -95,7 +96,7 @@ public class RDFaParser extends AbstractParser implements Parser { return doc; } - private Document[] parseHtml(DigestURL url, String mimeType, + private Document[] parseHtml(AnchorURL url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { @@ -178,7 +179,7 @@ public class RDFaParser extends AbstractParser implements Parser { if (aReader != null) { RDFaParser aParser = new RDFaParser(); try { - aParser.parse(new DigestURL(args[0]),"","",aURL.openStream()); + aParser.parse(new AnchorURL(args[0]),"","",aURL.openStream()); } catch (final FileNotFoundException e) { e.printStackTrace(); } catch (final IOException e) { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 231883904..a610163f8 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -38,7 +38,6 @@ import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.id.AnchorURL; -import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -58,7 +57,7 @@ public class rssParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL url, final String mimeType, + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) throws Failure, InterruptedException { RSSReader rssReader; @@ -80,7 +79,7 @@ public class rssParser extends AbstractParser implements Parser { languages = new HashSet(); languages.add(item.getLanguage()); anchors = new ArrayList(); - uri.getProperties().put("name", 
item.getTitle()); + uri.setNameProperty(item.getTitle()); anchors.add(uri); doc = new Document( uri, @@ -99,7 +98,7 @@ public class rssParser extends AbstractParser implements Parser { null, anchors, null, - new LinkedHashMap(), + new LinkedHashMap(), false, item.getPubDate()); docs.add(doc); diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index 1ac87a76a..d03e7836b 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -33,7 +33,7 @@ import java.util.Date; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -52,7 +52,7 @@ public class rtfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index b58d7e6a7..245521540 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -34,7 +34,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Date; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -55,7 +55,7 @@ public class sevenzipParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed"); } - public Document parse(final DigestURL location, final 
String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { + public Document parse(final AnchorURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { final Document doc = new Document( location, mimeType, @@ -100,12 +100,12 @@ public class sevenzipParser extends AbstractParser implements Parser { } } - public Document parse(final DigestURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException { + public Document parse(final AnchorURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException { return parse(location, mimeType, charset, new ByteArrayIInStream(source)); } @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); @@ -169,7 +169,7 @@ public class sevenzipParser extends AbstractParser implements Parser { Document[] theDocs; // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects - final DigestURL url = DigestURL.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath); + final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); diff --git a/source/net/yacy/document/parser/sidAudioParser.java 
b/source/net/yacy/document/parser/sidAudioParser.java index 799ef60a2..ad13aeca4 100644 --- a/source/net/yacy/document/parser/sidAudioParser.java +++ b/source/net/yacy/document/parser/sidAudioParser.java @@ -31,7 +31,7 @@ import java.util.Date; import java.util.HashMap; import java.util.Map; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -57,7 +57,7 @@ public class sidAudioParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index 17a472f6d..531785ece 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -40,6 +40,7 @@ import java.util.zip.GZIPInputStream; import javax.xml.parsers.DocumentBuilderFactory; import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -68,7 +69,7 @@ public class sitemapParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL url, final String mimeType, + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) throws Failure, InterruptedException { final List docs = new ArrayList(); @@ -95,7 +96,7 @@ public class sitemapParser extends AbstractParser implements Parser { null, null, null, - new LinkedHashMap(), + new LinkedHashMap(), false, new Date()); docs.add(doc); diff 
--git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 6be3f8edb..c009782fb 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -34,7 +34,6 @@ import java.util.Date; import java.util.List; import net.yacy.cora.document.id.AnchorURL; -import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -56,7 +55,7 @@ public class swfParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { @@ -97,7 +96,7 @@ public class swfParser extends AbstractParser implements Parser { url = contents.substring(urlStart,urlEnd); urlnr = Integer.toString(++urls).toString(); AnchorURL u = new AnchorURL(url); - u.getProperties().put("name", urlnr); + u.setNameProperty(urlnr); anchors.add(u); contents = contents.substring(0,urlStart)+contents.substring(urlEnd); } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index f70715218..c15737bcd 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -34,7 +34,7 @@ import java.util.List; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -61,7 +61,7 @@ public class tarParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL url, 
final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { final List docacc = new ArrayList(); Document[] subDocs = null; @@ -90,7 +90,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(DigestURL.newURL(url, "#" + name), mime, null, tmp); + subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 5c3ff5d1f..cf522ca04 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -33,7 +33,7 @@ import java.util.List; import java.util.Map; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; import net.yacy.document.Condenser; @@ -56,7 +56,7 @@ public class torrentParser extends AbstractParser implements Parser { } @Override - public Document[] parse(DigestURL location, String mimeType, String charset, InputStream source) + public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { byte[] b = null; try { @@ -119,7 +119,7 @@ public class torrentParser extends AbstractParser implements Parser { try { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); - Document[] d = parser.parse(new 
DigestURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); + Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 99b46f281..1c78b213f 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -41,7 +41,6 @@ import java.util.List; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; -import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -65,7 +64,7 @@ public class vcfParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source) + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { try { @@ -180,7 +179,7 @@ public class vcfParser extends AbstractParser implements Parser { } else if (key.toUpperCase().startsWith("URL")) { try { final AnchorURL newURL = new AnchorURL(value); - newURL.getProperties().put("name", newURL.toString()); + newURL.setNameProperty(newURL.toString()); anchors.add(newURL); //parsedData.put(key,value); } catch (final MalformedURLException ex) {/* ignore this */} diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index cd392d28f..59c1e484a 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ 
b/source/net/yacy/document/parser/vsdParser.java @@ -32,7 +32,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -66,7 +66,7 @@ public class vsdParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { Document theDoc = null; diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index ccd3d7d7e..beffaf038 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -30,6 +30,7 @@ package net.yacy.document.parser; import java.io.InputStream; import java.util.Date; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -66,7 +67,7 @@ public class xlsParser extends AbstractParser implements Parser { * all extracted information about the parsed document */ @Override - public Document[] parse(final DigestURL location, final String mimeType, + public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { return new XLSHSSFListener().parse(location, mimeType, charset, source); diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index aaaeb1527..ff232d723 100644 --- 
a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -32,6 +32,7 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -60,7 +61,7 @@ public class zipParser extends AbstractParser implements Parser { } @Override - public Document[] parse(final DigestURL url, final String mimeType, + public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser @@ -88,7 +89,7 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zis, tmp, entry.getSize()); final DigestURL virtualURL = DigestURL.newURL(url, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - docs = TextParser.parseSource(virtualURL, mime, null, tmp); + docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, tmp); if (docs == null) continue; for (final Document d: docs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index 8dc512ce5..8b652f892 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -47,6 +47,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.sorting.ClusteredScoreMap; @@ -159,8 +160,8 @@ public class WebStructureGraph { public void generateCitationReference(final 
DigestURL url, final Document document) { // generate citation reference - final Map hl = document.getHyperlinks(); - final Iterator it = hl.keySet().iterator(); + final Map hl = document.getHyperlinks(); + final Iterator it = hl.keySet().iterator(); final HashSet globalRefURLs = new HashSet(); final String refhost = url.getHost(); DigestURL u; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 776383dc3..6ad3273ef 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -40,6 +40,7 @@ import java.util.concurrent.TimeUnit; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; @@ -384,7 +385,7 @@ public final class LoaderDispatcher { * @return a map from URLs to the anchor texts of the urls * @throws IOException */ - public final Map loadLinks(final DigestURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { + public final Map loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index bb2a8b74c..f2d7d5943 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -94,6 +94,7 @@ import net.yacy.cora.document.encoding.UTF8; import 
net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; @@ -2480,7 +2481,7 @@ public final class Switchboard extends serverSwitch { // parse the document documents = TextParser.parseSource( - response.url(), + new AnchorURL(response.url()), response.getMimeType(), response.getCharacterEncoding(), response.getContent()); @@ -3408,9 +3409,9 @@ public final class Switchboard extends serverSwitch { } // get the links for a specific site - DigestURL url; + AnchorURL url; try { - url = new DigestURL(r); + url = new AnchorURL(r); } catch (final MalformedURLException e ) { ConcurrentLog.logException(e); return; @@ -3447,9 +3448,9 @@ public final class Switchboard extends serverSwitch { public void run() { // get the links for a specific site - final DigestURL startUrl; + final AnchorURL startUrl; try { - startUrl = new DigestURL(url); + startUrl = new AnchorURL(url); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); return; diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index cc4bed207..d9f2dc3fc 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -34,6 +34,7 @@ import java.util.concurrent.LinkedBlockingQueue; import org.apache.solr.common.SolrInputDocument; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; @@ -52,14 +53,14 @@ import net.yacy.search.schema.WebgraphConfiguration; */ public class DocumentIndex extends Segment { - private static DigestURL poison; + private static AnchorURL poison; static { try { - poison = new 
DigestURL("file://."); + poison = new AnchorURL("file://."); } catch (final MalformedURLException e ) { } } - BlockingQueue queue; // a queue of document ID's + BlockingQueue queue; // a queue of document ID's private final Worker[] worker; CallbackListener callback; @@ -80,7 +81,7 @@ public class DocumentIndex extends Segment { super.fulltext().connectLocalSolr(); super.fulltext().writeWebgraph(true); this.callback = callback; - this.queue = new LinkedBlockingQueue(WorkflowProcessor.availableCPU * 300); + this.queue = new LinkedBlockingQueue(WorkflowProcessor.availableCPU * 300); this.worker = new Worker[WorkflowProcessor.availableCPU]; for ( int i = 0; i < WorkflowProcessor.availableCPU; i++ ) { this.worker[i] = new Worker(i); @@ -96,7 +97,7 @@ public class DocumentIndex extends Segment { @Override public void run() { - DigestURL f; + AnchorURL f; SolrInputDocument[] resultRows; try { while ( (f = DocumentIndex.this.queue.take()) != poison ) { @@ -134,7 +135,7 @@ public class DocumentIndex extends Segment { this.queue.clear(); } - private SolrInputDocument[] add(final DigestURL url) throws IOException { + private SolrInputDocument[] add(final AnchorURL url) throws IOException { if ( url == null ) { throw new IOException("file = null"); } @@ -183,7 +184,7 @@ public class DocumentIndex extends Segment { * * @param start */ - public void addConcurrent(final DigestURL start) throws IOException { + public void addConcurrent(final AnchorURL start) throws IOException { assert (start != null); assert (start.canRead()) : start.toString(); if ( !start.isDirectory() ) { @@ -194,10 +195,10 @@ public class DocumentIndex extends Segment { return; } final String[] s = start.list(); - DigestURL w; + AnchorURL w; for ( final String t : s ) { try { - w = new DigestURL(start, t); + w = new AnchorURL(start, t); if ( w.canRead() && !w.isHidden() ) { if ( w.isDirectory() ) { addConcurrent(w); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java 
b/source/net/yacy/search/schema/WebgraphConfiguration.java index ad9a1c241..9b7ab6362 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -34,7 +34,6 @@ import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.regex.Pattern; @@ -120,15 +119,14 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial final List images, final boolean inbound, final Collection links, final IndexCell citations) { boolean allAttr = this.isEmpty(); + int target_order = 0; for (final AnchorURL target_url: links) { Set processTypes = new LinkedHashSet(); - final Properties p = target_url.getProperties(); - if (p == null) continue; - final String name = p.getProperty("name", ""); // the name attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String rel = p.getProperty("rel", ""); // the rel-attribute + final String name = target_url.getNameProperty(); // the name attribute + final String text = target_url.getTextProperty(); // the text between the tag + final String rel = target_url.getRelProperty(); // the rel-attribute int ioidx = inbound ? 0 : 1; // index organization @@ -140,6 +138,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi); SolrInputDocument edge = new SolrInputDocument(); add(edge, WebgraphSchema.id, id.toString()); + add(edge, WebgraphSchema.target_order_i, target_order++); if (allAttr || contains(WebgraphSchema.load_date_dt)) { Date loadDate = new Date(); Date modDate = responseHeader == null ? 
new Date() : responseHeader.lastModified(); diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 3086edf50..096a15d9a 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -72,6 +72,7 @@ public enum WebgraphSchema implements SchemaDeclaration { // target information target_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (target)"), + target_order_i(SolrType.num_integer, true, true, false, false, false, "order number of target url, a count from first to last URL on the source page (target)"), target_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (target)"), target_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (target)"), target_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (target)"), diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index 0ac454cde..0be262a08 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -38,6 +38,7 @@ import java.util.TreeSet; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; @@ -164,15 +165,15 @@ public class MediaSnippet implements Comparable, Comparator computeMediaSnippets(final DigestURL source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) { if (document == null) return new ArrayList(); - Map media = null; + Map media = null; if (mediatype == ContentDomain.AUDIO) 
media = document.getAudiolinks(); else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks(); else if (mediatype == ContentDomain.APP) media = document.getApplinks(); if (media == null) return null; - final Iterator> i = media.entrySet().iterator(); - Map.Entry entry; - DigestURL url; + final Iterator> i = media.entrySet().iterator(); + Map.Entry entry; + AnchorURL url; String desc; final List result = new ArrayList(); while (i.hasNext()) {