- replaced the properties object in AnchorURL with distinct variables
  for anchor attributes.
- as a consequence, large portions of the parser code had to be adapted
as well
- added a counter target_order_i for anchor links in webgraph
computation
pull/1/head
Michael Peter Christen 12 years ago
parent 3ea9bb4427
commit 61c5e40687

@ -129,6 +129,9 @@ target_name_t
## primary key of document, the URL hash (target)
target_id_s
## order number of target url, a count from first to last URL on the source page (target)
target_order_i
## the protocol of the url (target)
target_protocol_s

@ -34,10 +34,9 @@ import java.util.Collection;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -315,7 +314,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
final Map<DigestURL, ImageEntry> ts = document.getImages();
final Map<AnchorURL, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry;
while (tsi.hasNext()) {
@ -439,15 +438,14 @@ public class ViewFile {
final serverObjects prop,
final String[] wordArray,
int c,
final Map<DigestURL, String> media,
final Map<AnchorURL, String> media,
final String type,
boolean dark) {
int i = 0;
for (final Map.Entry<DigestURL, String> entry : media.entrySet()) {
final Properties p = entry.getKey().getProperties();
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
for (final Map.Entry<AnchorURL, String> entry : media.entrySet()) {
final String name = entry.getKey().getNameProperty(); // the name attribute
final String rel = entry.getKey().getRelProperty(); // the rel-attribute
final String text = entry.getKey().getTextProperty(); // the text between the <a></a> tag
prop.put("viewMode_links_" + c + "_nr", c);
prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));

@ -28,25 +28,38 @@ public class AnchorURL extends DigestURL {
private static final long serialVersionUID = 1586579902179962086L;
private Properties properties; // may contain additional url properties, such as given in html a href-links
private String nameProperty, textProperty, relProperty, hrefProperty; // may contain additional url properties, such as given in html a href-links
public AnchorURL(final String url) throws MalformedURLException {
super(url);
this.properties = new Properties();
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
}
/**
 * Wraps an existing DigestURL as an anchor URL, reusing the already
 * computed URL hash; all anchor tag attributes start out empty.
 *
 * @param url the digest URL to wrap
 */
public AnchorURL(final DigestURL url) {
super(url, url.hash());
// anchor attributes default to the empty string, not null
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
}
public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException {
super(baseURL, relPath);
this.properties = new Properties();
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
}
public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
super(protocol, host, port, path);
this.properties = new Properties();
}
public Properties getProperties() {
return this.properties;
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
}
public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException {
@ -65,4 +78,53 @@ public class AnchorURL extends DigestURL {
}
return new AnchorURL(baseURL, relPath);
}
/**
 * @return the value of the anchor tag's "name" attribute
 */
public String getNameProperty() {
    return this.nameProperty;
}
/**
 * Sets the value of the anchor tag's "name" attribute.
 *
 * @param name the name attribute value
 */
public void setNameProperty(String name) {
this.nameProperty = name;
}
/**
 * @return the text that appeared between the opening and closing anchor tag
 */
public String getTextProperty() {
    return this.textProperty;
}
/**
 * Sets the text that appeared between the opening and closing anchor tag.
 *
 * @param text the anchor text
 */
public void setTextProperty(String text) {
this.textProperty = text;
}
/**
 * @return the value of the anchor tag's "rel" attribute
 */
public String getRelProperty() {
    return this.relProperty;
}
/**
 * Sets the value of the anchor tag's "rel" attribute.
 *
 * @param rel the rel attribute value
 */
public void setRelProperty(String rel) {
this.relProperty = rel;
}
/**
 * @return the value of the anchor tag's "href" attribute
 */
public String getHrefProperty() {
    return this.hrefProperty;
}
/**
 * Sets the value of the anchor tag's "href" attribute.
 *
 * @param href the href attribute value
 */
public void setHrefProperty(String href) {
this.hrefProperty = href;
}
/**
 * Imports the recognized anchor attributes from a parsed tag's attribute
 * set; attributes missing from {@code tagopts} are stored as "".
 *
 * @param tagopts attribute key/value pairs as produced by the html scraper
 */
public void setAll(final Properties tagopts) {
    this.hrefProperty = tagopts.getProperty("href", "");
    this.relProperty = tagopts.getProperty("rel", "");
    this.nameProperty = tagopts.getProperty("name", "");
    this.textProperty = tagopts.getProperty("text", "");
}
/**
 * Exports the anchor attributes as a Properties object (the inverse of
 * {@code setAll}).
 *
 * @return a freshly created Properties holding the name, text, rel and
 *         href attribute values
 */
public Properties getAll() {
    final Properties p = new Properties();
    p.setProperty("href", this.hrefProperty);
    p.setProperty("rel", this.relProperty);
    p.setProperty("name", this.nameProperty);
    p.setProperty("text", this.textProperty);
    return p;
}
}

@ -27,7 +27,6 @@ import java.io.File;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
@ -53,7 +52,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
// class variables
private byte[] hash;
private Properties properties; // may contain additional url properties, such as given in html a href-links
/**
* Shortcut, calculate hash for shorted url/hostname
@ -117,7 +115,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
public DigestURL(final String url) throws MalformedURLException {
super(url);
this.hash = null;
this.properties = new Properties();
}
/**
@ -129,7 +126,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
public DigestURL(final String url, final byte[] hash) throws MalformedURLException {
super(url);
this.hash = hash;
this.properties = new Properties();
}
/**
@ -140,19 +136,16 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
public DigestURL(final MultiProtocolURL baseURL, final byte[] hash) {
super(baseURL);
this.hash = hash;
this.properties = new Properties();
}
public DigestURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException {
super(baseURL, relPath);
this.hash = null;
this.properties = new Properties();
}
public DigestURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
super(protocol, host, port, path);
this.hash = null;
this.properties = new Properties();
}
public static DigestURL newURL(final DigestURL baseURL, String relPath) throws MalformedURLException {
@ -173,10 +166,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
}
private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful
public Properties getProperties() {
return this.properties;
}
@Override
public int hashCode() {

@ -177,7 +177,7 @@ public final class CrawlStacker {
}
private void enqueueEntries(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final boolean replace) {
for (final DigestURL url: hyperlinks) {
for (final AnchorURL url: hyperlinks) {
if (url == null) continue;
// delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
@ -211,7 +211,7 @@ public final class CrawlStacker {
initiator,
url,
null,
url.getProperties().getProperty("name", ""),
url.getNameProperty(),
new Date(),
profileHandle,
0,

@ -31,6 +31,7 @@ import java.util.Queue;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.storage.SizeLimitedSet;
@ -61,7 +62,7 @@ public class ResultImages {
if (MemoryControl.shortStatus()) clearQueues();
limitQueues(1000);
final Map<DigestURL, ImageEntry> images = document.getImages();
final Map<AnchorURL, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
if (image == null || image.url() == null) continue;

@ -32,6 +32,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.HeaderFramework;
@ -828,7 +829,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
} catch (final Exception e) {
return null;
}

@ -145,8 +145,8 @@ public class BookmarkHelper {
writer.close();
links = scraper.getAnchors();
} catch (final IOException e) { ConcurrentLog.warn("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
for (final DigestURL url: links) {
title = url.getProperties().getProperty("name", "");
for (final AnchorURL url: links) {
title = url.getNameProperty();
ConcurrentLog.info("BOOKMARKS", "links.get(url)");
if ("".equals(title)) {//cannot be displayed
title = url.toString();

@ -43,7 +43,7 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.language.synonyms.SynonymLibrary;
@ -113,7 +113,7 @@ public final class Condenser {
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
Map.Entry<DigestURL, String> entry;
Map.Entry<AnchorURL, String> entry;
if (indexText) {
createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter:
@ -165,7 +165,7 @@ public final class Condenser {
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
Iterator<Map.Entry<DigestURL, String>> i = document.getAudiolinks().entrySet().iterator();
Iterator<Map.Entry<AnchorURL, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);

@ -77,11 +77,11 @@ public class Document {
private Object text; // the clear text, all that is visible
private final Collection<AnchorURL> anchors; // all links embedded as clickeable entities (anchor tags)
private final LinkedHashMap<DigestURL, String> rss; // all embedded rss feeds
private final LinkedHashMap<DigestURL, ImageEntry> images; // all visible pictures in document
private final LinkedHashMap<AnchorURL, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private LinkedHashMap<DigestURL, String> audiolinks, videolinks, applinks, hyperlinks;
private LinkedHashMap<AnchorURL, String> audiolinks, videolinks, applinks, hyperlinks;
private LinkedHashMap<DigestURL, String> inboundlinks, outboundlinks;
private Map<String, String> emaillinks;
private MultiProtocolURL favicon;
@ -104,7 +104,7 @@ public class Document {
final Object text,
final Collection<AnchorURL> anchors,
final LinkedHashMap<DigestURL, String> rss,
final LinkedHashMap<DigestURL, ImageEntry> images,
final LinkedHashMap<AnchorURL, ImageEntry> images,
final boolean indexingDenied,
final Date date) {
this.source = location;
@ -128,7 +128,7 @@ public class Document {
}
this.anchors = (anchors == null) ? new ArrayList<AnchorURL>(0) : anchors;
this.rss = (rss == null) ? new LinkedHashMap<DigestURL, String>(0) : rss;
this.images = (images == null) ? new LinkedHashMap<DigestURL, ImageEntry>() : images;
this.images = (images == null) ? new LinkedHashMap<AnchorURL, ImageEntry>() : images;
this.publisher = publisher;
this.hyperlinks = null;
this.audiolinks = null;
@ -417,30 +417,30 @@ dc_rights
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map<DigestURL, String> getHyperlinks() {
public Map<AnchorURL, String> getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (!this.resorted) resortLinks();
return this.hyperlinks;
}
public Map<DigestURL, String> getAudiolinks() {
public Map<AnchorURL, String> getAudiolinks() {
if (!this.resorted) resortLinks();
return this.audiolinks;
}
public Map<DigestURL, String> getVideolinks() {
public Map<AnchorURL, String> getVideolinks() {
if (!this.resorted) resortLinks();
return this.videolinks;
}
public Map<DigestURL, ImageEntry> getImages() {
public Map<AnchorURL, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns an htmlFilterImageEntry collection
if (!this.resorted) resortLinks();
return this.images;
}
public Map<DigestURL, String> getApplinks() {
public Map<AnchorURL, String> getApplinks() {
if (!this.resorted) resortLinks();
return this.applinks;
}
@ -474,19 +474,19 @@ dc_rights
final String thishost = this.source.getHost();
this.inboundlinks = new LinkedHashMap<DigestURL, String>();
this.outboundlinks = new LinkedHashMap<DigestURL, String>();
this.hyperlinks = new LinkedHashMap<DigestURL, String>();
this.videolinks = new LinkedHashMap<DigestURL, String>();
this.audiolinks = new LinkedHashMap<DigestURL, String>();
this.applinks = new LinkedHashMap<DigestURL, String>();
this.hyperlinks = new LinkedHashMap<AnchorURL, String>();
this.videolinks = new LinkedHashMap<AnchorURL, String>();
this.audiolinks = new LinkedHashMap<AnchorURL, String>();
this.applinks = new LinkedHashMap<AnchorURL, String>();
this.emaillinks = new LinkedHashMap<String, String>();
final Map<DigestURL, ImageEntry> collectedImages = new HashMap<DigestURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (final Map.Entry<DigestURL, ImageEntry> entry: collectedImages.entrySet()) {
final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (final Map.Entry<AnchorURL, ImageEntry> entry: collectedImages.entrySet()) {
if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
}
for (final DigestURL url: this.anchors) {
for (final AnchorURL url: this.anchors) {
if (url == null) continue;
final boolean noindex = url.getProperties().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0;
final boolean nofollow = url.getProperties().getProperty("rel", "").toLowerCase().indexOf("nofollow",0) >= 0;
final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0;
final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0;
if ((thishost == null && url.getHost() == null) ||
((thishost != null && url.getHost() != null) &&
(url.getHost().endsWith(thishost) ||
@ -496,7 +496,7 @@ dc_rights
this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
}
u = url.toNormalform(true);
final String name = url.getProperties().getProperty("name", "");
final String name = url.getNameProperty();
if (u.startsWith("mailto:")) {
this.emaillinks.put(u.substring(7), name);
} else {
@ -592,23 +592,23 @@ dc_rights
return v;
}
private static Map<DigestURL, String> allReflinks(final Collection<?> links) {
private static Map<AnchorURL, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final Map<DigestURL, String> v = new HashMap<DigestURL, String>();
final Map<AnchorURL, String> v = new HashMap<AnchorURL, String>();
final Iterator<?> i = links.iterator();
Object o;
DigestURL url = null;
AnchorURL url = null;
String u;
int pos;
loop: while (i.hasNext())
try {
o = i.next();
if (o instanceof DigestURL)
url = (DigestURL) o;
if (o instanceof AnchorURL)
url = (AnchorURL) o;
else if (o instanceof String)
url = new DigestURL((String) o);
url = new AnchorURL((String) o);
else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url();
else {
@ -622,7 +622,7 @@ dc_rights
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
u = u.substring(pos);
url = new DigestURL(u);
url = new AnchorURL(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
@ -632,7 +632,7 @@ dc_rights
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
u = "http:/" + u.substring(pos);
url = new DigestURL(u);
url = new AnchorURL(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
@ -783,7 +783,7 @@ dc_rights
final Collection<String> sectionTitles = new LinkedHashSet<String>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<DigestURL, ImageEntry>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
double lon = 0.0d, lat = 0.0d;
Date date = new Date();
@ -890,7 +890,7 @@ dc_rights
public static Map<DigestURL, String> getAudiolinks(final Document[] documents) {
final Map<DigestURL, String> result = new HashMap<DigestURL, String>();
for (final Document d: documents) {
for (Map.Entry<DigestURL, String> e: d.audiolinks.entrySet()) {
for (Map.Entry<AnchorURL, String> e: d.audiolinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
@ -900,7 +900,7 @@ dc_rights
public static Map<DigestURL, String> getVideolinks(final Document[] documents) {
final Map<DigestURL, String> result = new HashMap<DigestURL, String>();
for (final Document d: documents) {
for (Map.Entry<DigestURL, String> e: d.videolinks.entrySet()) {
for (Map.Entry<AnchorURL, String> e: d.videolinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
@ -910,7 +910,7 @@ dc_rights
public static Map<DigestURL, String> getApplinks(final Document[] documents) {
final Map<DigestURL, String> result = new HashMap<DigestURL, String>();
for (final Document d: documents) {
for (Map.Entry<DigestURL, String> e: d.applinks.entrySet()) {
for (Map.Entry<AnchorURL, String> e: d.applinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}

@ -26,7 +26,7 @@ package net.yacy.document;
import java.io.InputStream;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
public interface Parser {
@ -54,7 +54,7 @@ public interface Parser {
* @throws InterruptedException
*/
public Document[] parse(
DigestURL url,
AnchorURL url,
String mimeType,
String charset,
InputStream source

@ -34,7 +34,7 @@ import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.bzipParser;
@ -156,7 +156,7 @@ public final class TextParser {
}
public static Document[] parseSource(
final DigestURL location,
final AnchorURL location,
final String mimeType,
final String charset,
final File sourceFile
@ -186,7 +186,7 @@ public final class TextParser {
}
public static Document[] parseSource(
final DigestURL location,
final AnchorURL location,
String mimeType,
final String charset,
final byte[] content
@ -209,7 +209,7 @@ public final class TextParser {
}
public static Document[] parseSource(
final DigestURL location,
final AnchorURL location,
String mimeType,
final String charset,
final long contentLength,
@ -248,7 +248,7 @@ public final class TextParser {
}
private static Document[] parseSource(
final DigestURL location,
final AnchorURL location,
final String mimeType,
final Parser parser,
final String charset,
@ -269,7 +269,7 @@ public final class TextParser {
}
private static Document[] parseSource(
final DigestURL location,
final AnchorURL location,
final String mimeType,
final Set<Parser> parsers,
final String charset,

@ -51,6 +51,7 @@ import java.util.concurrent.TimeoutException;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
@ -501,7 +502,7 @@ public class MediawikiImporter extends Thread implements Importer {
public class wikiparserrecord {
public String title;
String source, html, hostport, urlStub;
DigestURL url;
AnchorURL url;
Document document;
public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
this.title = title;
@ -520,7 +521,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
public void genDocument() throws Parser.Failure {
try {
this.url = new DigestURL(this.urlStub + this.title);
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here

@ -35,7 +35,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -69,7 +69,7 @@ public class audioTagParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {

@ -6,6 +6,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ymark.YMarkUtil;
@ -35,7 +36,7 @@ public class AugmentParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(DigestURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(AnchorURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);

@ -31,7 +31,7 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -56,7 +56,7 @@ public class bzipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {

@ -33,7 +33,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -51,7 +51,7 @@ public class csvParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(DigestURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
// construct a document using all cells of the document
// the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser.

@ -30,7 +30,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -54,7 +54,7 @@ public class docParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {

@ -25,7 +25,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -60,7 +60,7 @@ public class dwgParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, true))

@ -27,7 +27,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -45,7 +45,7 @@ public class genericParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source1)
throws Parser.Failure, InterruptedException {
String filename = location.getFileName();

@ -32,7 +32,7 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -55,7 +55,7 @@ public class gzipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null;
Document[] docs = null;

@ -325,7 +325,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final DigestURL url = absolutePath(src);
final AnchorURL url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
@ -342,7 +342,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tagname.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
src.getProperties().putAll(tagopts);
src.setAll(tagopts);
this.anchors.add(src);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
@ -378,10 +378,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
tagopts.put("nme", areatitle);
tagopts.put("name", areatitle);
AnchorURL url = absolutePath(href);
tagopts.put("href", url.toNormalform(true));
url.getProperties().putAll(tagopts);
url.setAll(tagopts);
this.anchors.add(url);
}
} else if (tagname.equalsIgnoreCase("link")) {
@ -401,7 +401,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.getProperties().putAll(tagopts);
newLink.setAll(tagopts);
this.anchors.add(newLink);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("publisher")) {
@ -417,7 +417,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
tagopts.put("name", linktitle);
newLink.getProperties().putAll(tagopts);
newLink.setAll(tagopts);
this.anchors.add(newLink);
}
}
@ -432,7 +432,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tagopts.put("src", url.toNormalform(true));
final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
url.getProperties().putAll(tagopts);
url.setAll(tagopts);
this.anchors.add(url);
}
}
@ -442,13 +442,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
tagopts.put("value", url.toNormalform(true));
url.getProperties().putAll(tagopts);
url.setAll(tagopts);
this.anchors.add(url);
}
} else if (tagname.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
src.getProperties().putAll(tagopts);
src.setAll(tagopts);
this.anchors.add(src);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
@ -475,9 +475,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
this.images.add(ie);
} else {
tagopts.put("text", recursiveParse(url, text));
tagopts.put("text", new String(text));
tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.getProperties().putAll(tagopts);
url.setAll(tagopts);
recursiveParse(url, text);
this.anchors.add(url);
}
}
@ -541,7 +542,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" "));
}
private String recursiveParse(final DigestURL linkurl, final char[] inlineHtml) {
private String recursiveParse(final AnchorURL linkurl, final char[] inlineHtml) {
if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml)));
// start a new scraper to parse links inside this text

@ -26,12 +26,12 @@ package net.yacy.document.parser.html;
import java.util.Comparator;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
private final DigestURL imageurl;
private DigestURL linkurl;
private final AnchorURL imageurl;
private AnchorURL linkurl;
private final String alt;
private String anchortext;
private final int width, height;
@ -49,7 +49,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
* @param fileSize the number of bytes that the image uses on file or -1 if unknown
*/
public ImageEntry(
final DigestURL imageurl,
final AnchorURL imageurl,
final String alt,
final int width,
final int height,
@ -64,15 +64,15 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
this.fileSize = fileSize;
}
public DigestURL url() {
public AnchorURL url() {
return this.imageurl;
}
public void setLinkurl(DigestURL linkurl) {
public void setLinkurl(AnchorURL linkurl) {
this.linkurl = linkurl;
}
public DigestURL linkurl() {
public AnchorURL linkurl() {
return this.linkurl;
}

@ -35,6 +35,7 @@ import java.nio.charset.UnsupportedCharsetException;
import java.util.LinkedHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
@ -88,7 +89,7 @@ public class htmlParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final DigestURL location,
final AnchorURL location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
@ -126,7 +127,7 @@ public class htmlParser extends AbstractParser implements Parser {
sections[p++] = headline;
}
}
LinkedHashMap<DigestURL, ImageEntry> noDoubleImages = new LinkedHashMap<DigestURL, ImageEntry>();
LinkedHashMap<AnchorURL, ImageEntry> noDoubleImages = new LinkedHashMap<AnchorURL, ImageEntry>();
for (ImageEntry ie: scraper.getImages()) noDoubleImages.put(ie.url(), ie);
final Document ppd = new Document(
location,
@ -301,9 +302,9 @@ public class htmlParser extends AbstractParser implements Parser {
public static void main(final String[] args) {
// test parsing of a url
DigestURL url;
AnchorURL url;
try {
url = new DigestURL(args[0]);
url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent);
final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final String title = document[0].dc_title();

@ -46,7 +46,6 @@ import java.util.Set;
import javax.imageio.ImageIO;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -95,7 +94,7 @@ public class genericImageParser extends AbstractParser implements Parser {
@Override
public Document[] parse(
final DigestURL location,
final AnchorURL location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
@ -199,7 +198,7 @@ public class genericImageParser extends AbstractParser implements Parser {
final HashSet<String> languages = new HashSet<String>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<DigestURL, ImageEntry>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
// add this image to the map of images
final String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
@ -238,7 +237,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
public static ImageInfo parseJavaImage(
final DigestURL location,
final AnchorURL location,
final InputStream sourceStream) throws Parser.Failure {
BufferedImage image = null;
try {
@ -253,7 +252,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
public static ImageInfo parseJavaImage(
final DigestURL location,
final AnchorURL location,
final BufferedImage image) {
final ImageInfo ii = new ImageInfo(location);
ii.image = image;
@ -290,12 +289,12 @@ public class genericImageParser extends AbstractParser implements Parser {
}
public static class ImageInfo {
public DigestURL location;
public AnchorURL location;
public BufferedImage image;
public StringBuilder info;
public int height;
public int width;
public ImageInfo(final DigestURL location) {
public ImageInfo(final AnchorURL location) {
this.location = location;
this.image = null;
this.info = new StringBuilder();
@ -309,9 +308,9 @@ public class genericImageParser extends AbstractParser implements Parser {
public static void main(final String[] args) {
final File image = new File(args[0]);
final genericImageParser parser = new genericImageParser();
DigestURL uri;
AnchorURL uri;
try {
uri = new DigestURL("http://localhost/" + image.getName());
uri = new AnchorURL("http://localhost/" + image.getName());
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image));
System.out.println(document[0].toString());
} catch (final MalformedURLException e) {

@ -35,7 +35,7 @@ import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -70,7 +70,7 @@ public class mmParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException
{

@ -43,6 +43,7 @@ import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -214,7 +215,7 @@ public class odtParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile

@ -43,7 +43,7 @@ import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -91,7 +91,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
return parser;
}
private Document[] parse(final DigestURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException {
private Document[] parse(final AnchorURL location, final String mimeType, @SuppressWarnings("unused") final String charset, final File dest) throws Parser.Failure, InterruptedException {
CharBuffer writer = null;
try {
@ -201,7 +201,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile

@ -55,7 +55,7 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -84,7 +84,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))

@ -31,7 +31,7 @@ import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -60,7 +60,7 @@ public class pptParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure,
InterruptedException {
try {

@ -36,6 +36,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -256,7 +257,7 @@ public class psParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {

@ -30,7 +30,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -45,7 +45,7 @@ public class rdfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL url, final String mimeType,
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {

@ -17,6 +17,7 @@ import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -46,7 +47,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(DigestURL url, String mimeType,
public Document[] parse(AnchorURL url, String mimeType,
String charset, InputStream source) throws Failure,
InterruptedException {
@ -95,7 +96,7 @@ public class RDFaParser extends AbstractParser implements Parser {
return doc;
}
private Document[] parseHtml(DigestURL url, String mimeType,
private Document[] parseHtml(AnchorURL url, String mimeType,
String charset, InputStream source) throws Failure,
InterruptedException {
@ -178,7 +179,7 @@ public class RDFaParser extends AbstractParser implements Parser {
if (aReader != null) {
RDFaParser aParser = new RDFaParser();
try {
aParser.parse(new DigestURL(args[0]),"","",aURL.openStream());
aParser.parse(new AnchorURL(args[0]),"","",aURL.openStream());
} catch (final FileNotFoundException e) {
e.printStackTrace();
} catch (final IOException e) {

@ -38,7 +38,6 @@ import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -58,7 +57,7 @@ public class rssParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL url, final String mimeType,
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
RSSReader rssReader;
@ -80,7 +79,7 @@ public class rssParser extends AbstractParser implements Parser {
languages = new HashSet<String>();
languages.add(item.getLanguage());
anchors = new ArrayList<AnchorURL>();
uri.getProperties().put("name", item.getTitle());
uri.setNameProperty(item.getTitle());
anchors.add(uri);
doc = new Document(
uri,
@ -99,7 +98,7 @@ public class rssParser extends AbstractParser implements Parser {
null,
anchors,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
new LinkedHashMap<AnchorURL, ImageEntry>(),
false,
item.getPubDate());
docs.add(doc);

@ -33,7 +33,7 @@ import java.util.Date;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -52,7 +52,7 @@ public class rtfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {

@ -34,7 +34,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -55,7 +55,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
}
public Document parse(final DigestURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
public Document parse(final AnchorURL location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
final Document doc = new Document(
location,
mimeType,
@ -100,12 +100,12 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
}
public Document parse(final DigestURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException {
public Document parse(final AnchorURL location, final String mimeType, final String charset, final byte[] source) throws Parser.Failure, InterruptedException {
return parse(location, mimeType, charset, new ByteArrayIInStream(source));
}
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
@ -169,7 +169,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
Document[] theDocs;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final DigestURL url = DigestURL.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());

@ -31,7 +31,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -57,7 +57,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
try {

@ -40,6 +40,7 @@ import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -68,7 +69,7 @@ public class sitemapParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL url, final String mimeType,
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
final List<Document> docs = new ArrayList<Document>();
@ -95,7 +96,7 @@ public class sitemapParser extends AbstractParser implements Parser {
null,
null,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
new LinkedHashMap<AnchorURL, ImageEntry>(),
false,
new Date());
docs.add(doc);

@ -34,7 +34,6 @@ import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -56,7 +55,7 @@ public class swfParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException
{
@ -97,7 +96,7 @@ public class swfParser extends AbstractParser implements Parser {
url = contents.substring(urlStart,urlEnd);
urlnr = Integer.toString(++urls).toString();
AnchorURL u = new AnchorURL(url);
u.getProperties().put("name", urlnr);
u.setNameProperty(urlnr);
anchors.add(u);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}

@ -34,7 +34,7 @@ import java.util.List;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -61,7 +61,7 @@ public class tarParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
@ -90,7 +90,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(DigestURL.newURL(url, "#" + name), mime, null, tmp);
subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, tmp);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -33,7 +33,7 @@ import java.util.List;
import java.util.Map;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Condenser;
@ -56,7 +56,7 @@ public class torrentParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(DigestURL location, String mimeType, String charset, InputStream source)
public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source)
throws Parser.Failure, InterruptedException {
byte[] b = null;
try {
@ -119,7 +119,7 @@ public class torrentParser extends AbstractParser implements Parser {
try {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new DigestURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);

@ -41,7 +41,6 @@ import java.util.List;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -65,7 +64,7 @@ public class vcfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL url, final String mimeType, final String charset, final InputStream source)
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
try {
@ -180,7 +179,7 @@ public class vcfParser extends AbstractParser implements Parser {
} else if (key.toUpperCase().startsWith("URL")) {
try {
final AnchorURL newURL = new AnchorURL(value);
newURL.getProperties().put("name", newURL.toString());
newURL.setNameProperty(newURL.toString());
anchors.add(newURL);
//parsedData.put(key,value);
} catch (final MalformedURLException ex) {/* ignore this */}

@ -32,7 +32,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -66,7 +66,7 @@ public class vsdParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset, final InputStream source)
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
Document theDoc = null;

@ -30,6 +30,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -66,7 +67,7 @@ public class xlsParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final DigestURL location, final String mimeType,
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure,
InterruptedException {
return new XLSHSSFListener().parse(location, mimeType, charset, source);

@ -32,6 +32,7 @@ import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -60,7 +61,7 @@ public class zipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final DigestURL url, final String mimeType,
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
// check memory for parser
@ -88,7 +89,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, tmp);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -47,6 +47,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.sorting.ClusteredScoreMap;
@ -159,8 +160,8 @@ public class WebStructureGraph {
public void generateCitationReference(final DigestURL url, final Document document) {
// generate citation reference
final Map<DigestURL, String> hl = document.getHyperlinks();
final Iterator<DigestURL> it = hl.keySet().iterator();
final Map<AnchorURL, String> hl = document.getHyperlinks();
final Iterator<AnchorURL> it = hl.keySet().iterator();
final HashSet<DigestURL> globalRefURLs = new HashSet<DigestURL>();
final String refhost = url.getHost();
DigestURL u;

@ -40,6 +40,7 @@ import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
@ -384,7 +385,7 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<DigestURL, String> loadLinks(final DigestURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
public final Map<DigestURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();

@ -94,6 +94,7 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
@ -2480,7 +2481,7 @@ public final class Switchboard extends serverSwitch {
// parse the document
documents =
TextParser.parseSource(
response.url(),
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.getContent());
@ -3408,9 +3409,9 @@ public final class Switchboard extends serverSwitch {
}
// get the links for a specific site
DigestURL url;
AnchorURL url;
try {
url = new DigestURL(r);
url = new AnchorURL(r);
} catch (final MalformedURLException e ) {
ConcurrentLog.logException(e);
return;
@ -3447,9 +3448,9 @@ public final class Switchboard extends serverSwitch {
public void run() {
// get the links for a specific site
final DigestURL startUrl;
final AnchorURL startUrl;
try {
startUrl = new DigestURL(url);
startUrl = new AnchorURL(url);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
return;

@ -34,6 +34,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
@ -52,14 +53,14 @@ import net.yacy.search.schema.WebgraphConfiguration;
*/
public class DocumentIndex extends Segment {
private static DigestURL poison;
private static AnchorURL poison;
static {
try {
poison = new DigestURL("file://.");
poison = new AnchorURL("file://.");
} catch (final MalformedURLException e ) {
}
}
BlockingQueue<DigestURL> queue; // a queue of document ID's
BlockingQueue<AnchorURL> queue; // a queue of document ID's
private final Worker[] worker;
CallbackListener callback;
@ -80,7 +81,7 @@ public class DocumentIndex extends Segment {
super.fulltext().connectLocalSolr();
super.fulltext().writeWebgraph(true);
this.callback = callback;
this.queue = new LinkedBlockingQueue<DigestURL>(WorkflowProcessor.availableCPU * 300);
this.queue = new LinkedBlockingQueue<AnchorURL>(WorkflowProcessor.availableCPU * 300);
this.worker = new Worker[WorkflowProcessor.availableCPU];
for ( int i = 0; i < WorkflowProcessor.availableCPU; i++ ) {
this.worker[i] = new Worker(i);
@ -96,7 +97,7 @@ public class DocumentIndex extends Segment {
@Override
public void run() {
DigestURL f;
AnchorURL f;
SolrInputDocument[] resultRows;
try {
while ( (f = DocumentIndex.this.queue.take()) != poison ) {
@ -134,7 +135,7 @@ public class DocumentIndex extends Segment {
this.queue.clear();
}
private SolrInputDocument[] add(final DigestURL url) throws IOException {
private SolrInputDocument[] add(final AnchorURL url) throws IOException {
if ( url == null ) {
throw new IOException("file = null");
}
@ -183,7 +184,7 @@ public class DocumentIndex extends Segment {
*
* @param start
*/
public void addConcurrent(final DigestURL start) throws IOException {
public void addConcurrent(final AnchorURL start) throws IOException {
assert (start != null);
assert (start.canRead()) : start.toString();
if ( !start.isDirectory() ) {
@ -194,10 +195,10 @@ public class DocumentIndex extends Segment {
return;
}
final String[] s = start.list();
DigestURL w;
AnchorURL w;
for ( final String t : s ) {
try {
w = new DigestURL(start, t);
w = new AnchorURL(start, t);
if ( w.canRead() && !w.isHidden() ) {
if ( w.isDirectory() ) {
addConcurrent(w);

@ -34,7 +34,6 @@ import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
@ -120,15 +119,14 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
final IndexCell<CitationReference> citations) {
boolean allAttr = this.isEmpty();
int target_order = 0;
for (final AnchorURL target_url: links) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final Properties p = target_url.getProperties();
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
final String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1;
// index organization
@ -140,6 +138,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
SolrInputDocument edge = new SolrInputDocument();
add(edge, WebgraphSchema.id, id.toString());
add(edge, WebgraphSchema.target_order_i, target_order++);
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();

@ -72,6 +72,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
// target information
target_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (target)"),
target_order_i(SolrType.num_integer, true, true, false, false, false, "order number of target url, a count from first to last URL on the source page (target)"),
target_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (target)"),
target_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (target)"),
target_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (target)"),

@ -38,6 +38,7 @@ import java.util.TreeSet;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
@ -164,15 +165,15 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
public static List<MediaSnippet> computeMediaSnippets(final DigestURL source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) {
if (document == null) return new ArrayList<MediaSnippet>();
Map<DigestURL, String> media = null;
Map<AnchorURL, String> media = null;
if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks();
else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks();
else if (mediatype == ContentDomain.APP) media = document.getApplinks();
if (media == null) return null;
final Iterator<Map.Entry<DigestURL, String>> i = media.entrySet().iterator();
Map.Entry<DigestURL, String> entry;
DigestURL url;
final Iterator<Map.Entry<AnchorURL, String>> i = media.entrySet().iterator();
Map.Entry<AnchorURL, String> entry;
AnchorURL url;
String desc;
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {

Loading…
Cancel
Save