- the webgraph shall store all links which appear on a web page, not just the set of unique links. This made it necessary to adapt a large portion of the parser and link-processing classes so that they carry a different type of link collection, one whose entries hold the property attributes attached to web anchors (short usage sketches follow the Crawler_p and ViewFile hunks below).
- introduction of a new URL class, AnchorURL
- the other URL classes, DigestURI and MultiProtocolURI, have been renamed to DigestURL and MultiProtocolURL and refactored to fit into the new document package schema, document.id (a before/after sketch follows the commit header below)
- cleanup and refactoring of the net.yacy.cora.document package
pull/1/head
Michael Peter Christen 11 years ago
parent 1a8c64117f
commit 5e31bad711
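
Migration sketch (illustrative, not part of the commit): a minimal before/after view of the class renames listed above, assembled only from constructors and helpers that actually appear in the hunks below; the class name UrlClassMigrationSketch and the example URL are placeholders.

import java.net.MalformedURLException;

import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;

public class UrlClassMigrationSketch {
    public static void main(final String[] args) {
        // before: net.yacy.kelondro.data.meta.DigestURI, net.yacy.cora.document.MultiProtocolURI, net.yacy.cora.document.UTF8
        // after:  net.yacy.cora.document.id.DigestURL, net.yacy.cora.document.id.MultiProtocolURL, net.yacy.cora.document.encoding.UTF8
        DigestURL url;
        try {
            url = new DigestURL("http://example.org/index.html"); // same constructor-and-catch pattern, new class name
        } catch (final MalformedURLException e) {
            url = null;
        }
        if (url != null) {
            System.out.println(url.toNormalform(true));  // normalform rendering, unchanged by the rename
            System.out.println(UTF8.String(url.hash()));  // url hash as string, as used in several hunks below
        }
    }
}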

.gitignore

@ -12,3 +12,4 @@ RELEASE/
lib/yacy-cora.jar
/DATA.bkp
/DATA.1
/gen

@ -31,8 +31,8 @@
import java.net.MalformedURLException;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@ -55,9 +55,9 @@ public class BlacklistTest_p {
!urlstring.startsWith("ftp://") &&
!urlstring.startsWith("smb://") &&
!urlstring.startsWith("file://")) urlstring = "http://" + urlstring;
DigestURI testurl = null;
DigestURL testurl = null;
try {
testurl = new DigestURI(urlstring);
testurl = new DigestURL(urlstring);
} catch (final MalformedURLException e) {
testurl = null;
}

@ -35,12 +35,12 @@ import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.List;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import net.yacy.repository.Blacklist.BlacklistType;
@ -81,9 +81,9 @@ public class Blacklist_p {
!urlstring.startsWith("file://")) {
urlstring = "http://"+urlstring;
}
DigestURI testurl;
DigestURL testurl;
try {
testurl = new DigestURI(urlstring);
testurl = new DigestURL(urlstring);
} catch (final MalformedURLException e) {
testurl = null;
}

@ -38,7 +38,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;

@ -35,7 +35,7 @@ import java.io.PrintWriter;
import java.util.Date;
import java.util.Iterator;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;

@ -39,7 +39,8 @@ import java.util.Map;
import java.util.Set;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
@ -53,7 +54,6 @@ import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.NewsPool;
import net.yacy.search.Switchboard;
@ -247,7 +247,7 @@ public class Bookmarks {
try {
final File file = new File(post.get("htmlfile"));
BookmarkHelper.importFromBookmarks(sb.bookmarksDB, new DigestURI(file), post.get("htmlfile$file"), tags, isPublic);
BookmarkHelper.importFromBookmarks(sb.bookmarksDB, new DigestURL(file), post.get("htmlfile$file"), tags, isPublic);
} catch (final MalformedURLException e) {}
ConcurrentLog.info("BOOKMARKS", "success!!");

@ -24,13 +24,13 @@
import java.net.MalformedURLException;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.Cache;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.servletProperties;
@ -44,9 +44,9 @@ public class CacheResource_p {
if (post == null) return prop;
final String u = post.get("url", "");
DigestURI url;
DigestURL url;
try {
url = new DigestURI(u);
url = new DigestURL(u);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
return prop;

@ -24,7 +24,7 @@
import java.util.Random;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.ResultImages;
@ -89,8 +89,8 @@ public class Collage {
final int yOffset = embed ? 0 : 70;
for (int i = 0; i < fifoSize; i++) {
final MultiProtocolURI baseURL = origins[i].baseURL;
final MultiProtocolURI imageURL = origins[i].imageEntry.url();
final MultiProtocolURL baseURL = origins[i].baseURL;
final MultiProtocolURL imageURL = origins[i].imageEntry.url();
// check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted

@ -39,9 +39,9 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@ -101,7 +101,7 @@ public class ConfigAppearance_p {
final Iterator<String> it;
try {
final DigestURI u = new DigestURI(url);
final DigestURL u = new DigestURL(url);
it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent));
} catch (final IOException e) {
prop.put("status", "1");// unable to get URL

@ -239,7 +239,7 @@ public class ConfigHeuristics_p {
// re-read config (and create/update work table)
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
OpenSearchConnector os = new OpenSearchConnector(sb, true);
new OpenSearchConnector(sb, true);
}
}
}

@ -39,11 +39,11 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.Translator;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -100,7 +100,7 @@ public class ConfigLanguage_p {
final String url = post.get("url");
Iterator<String> it;
try {
final DigestURI u = new DigestURI(url);
final DigestURL u = new DigestURL(url);
it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent));
} catch(final IOException e) {
prop.put("status", "1");//unable to get url

@ -30,7 +30,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.util.FileUtils;

@ -30,10 +30,11 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Properties;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
@ -98,7 +99,7 @@ public class ConfigPortal {
String excludehosts = post.get("search.excludehosts", "");
sb.setConfig("search.excludehosts", excludehosts);
sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts));
sb.setConfig("search.excludehosth", DigestURL.hosthashes(excludehosts));
}
if (post.containsKey("searchpage_default")) {
// load defaults from defaults/yacy.init file

@ -32,9 +32,9 @@ import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
import net.yacy.peers.operation.yacyBuildProperties;
@ -86,7 +86,7 @@ public class ConfigUpdate_p {
final String release = post.get("releasedownload", "");
if (!release.isEmpty()) {
try {
yacyRelease versionToDownload = new yacyRelease(new DigestURI(release));
yacyRelease versionToDownload = new yacyRelease(new DigestURL(release));
// replace this version with version which contains public key
final yacyRelease.DevAndMainVersions allReleases = yacyRelease.allReleases(false, false);

@ -24,6 +24,7 @@ import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -31,7 +32,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -49,7 +49,7 @@ public class CrawlCheck_p {
if (post.containsKey("crawlcheck")) {
// get the list of rootURls for this crawl start
Set<DigestURI> rootURLs = new HashSet<DigestURI>();
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
String crawlingStart0 = post.get("crawlingURLs","").trim();
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
for (String crawlingStart: rootURLs0) {
@ -61,7 +61,7 @@ public class CrawlCheck_p {
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
}
try {
DigestURI crawlingStartURL = new DigestURI(crawlingStart);
DigestURL crawlingStartURL = new DigestURL(crawlingStart);
rootURLs.add(crawlingStartURL);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
@ -78,7 +78,7 @@ public class CrawlCheck_p {
// and analyze the urls to make the table rows
StringBuilder s = new StringBuilder(300);
int row = 0;
for (DigestURI u: rootURLs) {
for (DigestURL u: rootURLs) {
s.append(u.toNormalform(true)).append('\n');
prop.put("table_list_" + row + "_url", u.toNormalform(true));

@ -31,8 +31,8 @@ import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.ResultURLs;

@ -31,6 +31,7 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
@ -40,7 +41,6 @@ import net.yacy.cora.protocol.Scanner.Access;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEventCache;
@ -193,10 +193,10 @@ public class CrawlStartScanner_p
if ( post.containsKey("crawl") ) {
// make a pk/url mapping
final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
final Map<byte[], DigestURI> pkmap = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
final Map<byte[], DigestURL> pkmap = new TreeMap<byte[], DigestURL>(Base64Order.enhancedCoder);
while (se.hasNext()) {
final Scanner.Service u = se.next().getKey();
DigestURI uu;
DigestURL uu;
try {
uu = u.url();
pkmap.put(uu.hash(), uu);
@ -208,7 +208,7 @@ public class CrawlStartScanner_p
for ( final Map.Entry<String, String> entry : post.entrySet() ) {
if ( entry.getValue().startsWith("mark_") ) {
final byte[] pk = entry.getValue().substring(5).getBytes();
final DigestURI url = pkmap.get(pk);
final DigestURL url = pkmap.get(pk);
if ( url != null ) {
String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99&directDocByURL=off";
path += "&crawlingURL=" + url.toNormalform(true);
@ -244,7 +244,7 @@ public class CrawlStartScanner_p
final Map<byte[], String> apiCommentCache = WorkTables.commentCache(sb);
String urlString;
DigestURI u;
DigestURL u;
try {
final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<Scanner.Service, Scanner.Access> host;

@ -30,12 +30,13 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -50,7 +51,6 @@ import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.NewsPool;
@ -175,7 +175,7 @@ public class Crawler_p {
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
Set<DigestURI> rootURLs = new HashSet<DigestURI>();
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
String crawlName = "";
if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
if (crawlingStart == null || crawlingStart.length() == 0) continue;
@ -185,7 +185,7 @@ public class Crawler_p {
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart;
}
try {
DigestURI crawlingStartURL = new DigestURI(crawlingStart);
DigestURL crawlingStartURL = new DigestURL(crawlingStart);
rootURLs.add(crawlingStartURL);
crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ',';
if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
@ -288,14 +288,14 @@ public class Crawler_p {
if ("sitelist".equals(crawlingMode)) {
newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
Set<DigestURI> newRootURLs = new HashSet<DigestURI>();
for (DigestURI sitelistURL: rootURLs) {
Set<DigestURL> newRootURLs = new HashSet<DigestURL>();
for (DigestURL sitelistURL: rootURLs) {
// download document
Document scraper;
try {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);
// get links and generate filter
for (DigestURI u: scraper.getAnchors().keySet()) {
for (DigestURL u: scraper.getAnchors()) {
newRootURLs.add(u);
}
} catch (final IOException e) {
@ -313,14 +313,14 @@ public class Crawler_p {
if (fullDomain) {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
for (DigestURL u: rootURLs) {
sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate);
}
}
} else if (subPath) {
siteFilter = CrawlProfile.subpathFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
for (DigestURL u: rootURLs) {
String basepath = u.toNormalform(true);
if (!basepath.endsWith("/")) {int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
int count = sb.index.fulltext().remove(basepath, deleteageDate);
@ -339,7 +339,7 @@ public class Crawler_p {
// check if the crawl filter works correctly
try {
Pattern mmp = Pattern.compile(newcrawlingMustMatch);
for (DigestURI u: rootURLs) {
for (DigestURL u: rootURLs) {
assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true);
}
} catch (final PatternSyntaxException e) {
@ -389,7 +389,7 @@ public class Crawler_p {
// delete all error urls for that domain
List<byte[]> hosthashes = new ArrayList<byte[]>();
for (DigestURI u: rootURLs) {
for (DigestURL u: rootURLs) {
hosthashes.add(ASCII.getBytes(u.hosthash()));
}
sb.crawlQueues.errorURL.removeHosts(hosthashes, false);
@ -411,8 +411,8 @@ public class Crawler_p {
// stack requests
sb.crawler.putActive(handle, profile);
final Set<DigestURI> successurls = new HashSet<DigestURI>();
final Map<DigestURI,String> failurls = new HashMap<DigestURI, String>();
final Set<DigestURL> successurls = new HashSet<DigestURL>();
final Map<DigestURL,String> failurls = new HashMap<DigestURL, String>();
sb.stackURLs(rootURLs, profile, successurls, failurls);
if (failurls.size() == 0) {
@ -439,7 +439,7 @@ public class Crawler_p {
}
} else {
StringBuilder fr = new StringBuilder();
for (Map.Entry<DigestURI, String> failure: failurls.entrySet()) {
for (Map.Entry<DigestURL, String> failure: failurls.entrySet()) {
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
@ -470,7 +470,7 @@ public class Crawler_p {
} else if ("sitemap".equals(crawlingMode)) {
final String sitemapURLStr = post.get("sitemapURL","");
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
final DigestURL sitemapURL = new DigestURL(sitemapURLStr);
sb.crawler.putActive(handle, profile);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
importer.start();
@ -488,7 +488,7 @@ public class Crawler_p {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile), 10000000);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer);
@ -498,12 +498,12 @@ public class Crawler_p {
writer.close();
// get links and generate filter
final Map<DigestURI, Properties> hyperlinks = scraper.getAnchors();
final List<AnchorURL> hyperlinks = scraper.getAnchors();
if (newcrawlingdepth > 0) {
if (fullDomain) {
newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks.keySet());
newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks);
} else if (subPath) {
newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks.keySet());
newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks);
}
}
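
Illustration (not part of the commit): a minimal sketch of the new anchor handling in the Crawler_p hunks above, where ContentScraper.getAnchors() now returns a List<AnchorURL> holding every anchor of the page instead of a de-duplicated Map key set. The wrapper class and the String return type of CrawlProfile.siteFilter are assumptions; the individual calls are taken from those hunks.

import java.util.List;
import java.util.Set;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.html.ContentScraper;

class AnchorCollectionSketch {
    // mirrors the sitelist/file crawl-start branches above
    static String collectRootsAndFilter(final ContentScraper scraper, final Set<DigestURL> rootURLs) {
        final List<AnchorURL> hyperlinks = scraper.getAnchors(); // all anchors, duplicates included
        for (final DigestURL u : hyperlinks) {                   // an AnchorURL is usable wherever a DigestURL was expected
            rootURLs.add(u);
        }
        return CrawlProfile.siteFilter(hyperlinks);              // the filter builders now take the anchor collection directly
    }
}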

@ -21,6 +21,7 @@
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeonamesLocation;
import net.yacy.cora.geo.OpenGeoDBLocation;
@ -29,7 +30,6 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -66,7 +66,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@ -108,7 +108,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon1Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@ -150,7 +150,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon2Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@ -192,7 +192,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@ -235,7 +235,7 @@ public class DictionaryLoader_p {
if (post.containsKey("drw0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
LibraryProvider.activateDeReWo();
@ -279,7 +279,7 @@ public class DictionaryLoader_p {
if (post.containsKey("pnd0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final Response response = sb.loader.load(sb.loader.request(new DigestURL(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
LibraryProvider.activatePND();

@ -33,9 +33,10 @@ import java.util.concurrent.BlockingQueue;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
@ -46,7 +47,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
import net.yacy.search.Switchboard;
@ -115,8 +115,8 @@ public class HostBrowser {
prop.putHTML("path", path);
prop.put("delete", admin && path.length() > 0 ? 1 : 0);
DigestURI pathURI = null;
try {pathURI = new DigestURI(path);} catch (final MalformedURLException e) {}
DigestURL pathURI = null;
try {pathURI = new DigestURL(path);} catch (final MalformedURLException e) {}
String load = post.get("load", "");
boolean wait = false;
@ -127,10 +127,10 @@ public class HostBrowser {
}
if (load.length() > 0 && loadRight) {
// stack URL
DigestURI url;
DigestURL url;
if (sb.crawlStacker.size() > 2) wait = false;
try {
url = new DigestURI(load);
url = new DigestURL(load);
String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url, null, load, new Date(),
@ -244,7 +244,7 @@ public class HostBrowser {
}
try {
// generate file list from path
DigestURI uri = new DigestURI(path);
DigestURL uri = new DigestURL(path);
String host = uri.getHost();
prop.putHTML("outbound_host", host);
if (admin) prop.putHTML("outbound_admin_host", host); //used for WebStructurePicture_p link
@ -322,7 +322,7 @@ public class HostBrowser {
while (links.hasNext()) {
u = links.next();
try {
MultiProtocolURI mu = new MultiProtocolURI(u);
MultiProtocolURL mu = new MultiProtocolURL(u);
if (mu.getHost() != null) {
ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
if (lks == null) {
@ -422,7 +422,7 @@ public class HostBrowser {
prop.put("files_list_" + c + "_type", 0);
prop.put("files_list_" + c + "_type_url", entry.getKey());
StoreType type = (StoreType) entry.getValue();
try {uri = new DigestURI(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash());
boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
@ -541,12 +541,12 @@ public class HostBrowser {
// get all urls from the index and store them here
for (String id: internalIDs) {
if (id.equals(urlhash)) continue; // no self-references
DigestURI u = fulltext.getURL(ASCII.getBytes(id));
DigestURL u = fulltext.getURL(ASCII.getBytes(id));
if (u != null) references_internal_urls.add(u.toNormalform(true));
}
for (String id: externalIDs) {
if (id.equals(urlhash)) continue; // no self-references
DigestURI u = fulltext.getURL(ASCII.getBytes(id));
DigestURL u = fulltext.getURL(ASCII.getBytes(id));
if (u != null) references_external_urls.add(u.toNormalform(true));
}
} catch (final IOException e) {

@ -31,8 +31,9 @@ import java.util.Iterator;
import java.util.List;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -42,7 +43,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.ListManager;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -368,7 +368,7 @@ public class IndexControlRWIs_p {
if ( post.containsKey("blacklisturls") ) {
final String[] supportedBlacklistTypes =
env.getConfig("BlackLists.types", "").split(",");
DigestURI url;
DigestURL url;
for ( final byte[] b : urlb ) {
try {
urlHashes.put(b);
@ -395,7 +395,7 @@ public class IndexControlRWIs_p {
}
if ( post.containsKey("blacklistdomains") ) {
DigestURI url;
DigestURL url;
for ( final byte[] b : urlb ) {
try {
urlHashes.put(b);
@ -461,7 +461,7 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64());
prop.put("genUrlList_lines", maxlines);
int i = 0;
DigestURI url;
DigestURL url;
URIMetadataNode entry;
String us;
long rn = -1;
@ -483,7 +483,7 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_urlList_" + i + "_urlExists_urlStringShort",
(us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_ranking", (entry.ranking() - rn));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_domlength", DigestURL.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_" + i + "_urlExists_tf", 1000.0 * entry.word().termFrequency());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_authority", (theSearch.getOrder() == null) ? -1 : theSearch.getOrder().authority(ASCII.String(entry.hash(), 6, 6)));
prop.put("genUrlList_urlList_" + i + "_urlExists_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified())));
@ -627,7 +627,7 @@ public class IndexControlRWIs_p {
filter,
false,
null,
DigestURI.TLD_any_zone_filter,
DigestURL.TLD_any_zone_filter,
"",
false,
sb.index,

@ -33,7 +33,8 @@ import java.util.List;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.ClientIdentification;
@ -43,7 +44,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.Switchboard;
@ -127,7 +127,7 @@ public class IndexControlURLs_p {
String urlhash = post.get("urlhash", "").trim();
if (urlhash.isEmpty() && urlstring.length() > 0) {
try {
urlhash = ASCII.String(new DigestURI(urlstring).hash());
urlhash = ASCII.String(new DigestURL(urlstring).hash());
} catch (final MalformedURLException e) {
}
}
@ -184,7 +184,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
final DigestURI url = segment.fulltext().getURL(ASCII.getBytes(urlhash));
final DigestURL url = segment.fulltext().getURL(ASCII.getBytes(urlhash));
if (url == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -197,7 +197,7 @@ public class IndexControlURLs_p {
if (post.containsKey("urldelete")) {
try {
urlhash = ASCII.String((new DigestURI(urlstring)).hash());
urlhash = ASCII.String((new DigestURL(urlstring)).hash());
} catch (final MalformedURLException e) {
urlhash = null;
}
@ -211,7 +211,7 @@ public class IndexControlURLs_p {
if (post.containsKey("urlstringsearch")) {
try {
final DigestURI url = new DigestURI(urlstring);
final DigestURL url = new DigestURL(urlstring);
urlhash = ASCII.String(url.hash());
prop.put("urlhash", urlhash);
final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));

@ -27,7 +27,7 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.peers.Seed;

@ -26,11 +26,11 @@
import java.util.ArrayList;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.CrawlStacker;
import net.yacy.crawler.data.ZURL;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -72,7 +72,7 @@ public class IndexCreateParserErrors_p {
prop.put("rejected_only-latest", "0");
}
dark = true;
DigestURI url;
DigestURL url;
byte[] initiatorHash, executorHash;
Seed initiatorSeed, executorSeed;
int j=0;

@ -9,7 +9,7 @@ import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;

@ -30,12 +30,12 @@ import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.query.QueryModifier;
import net.yacy.search.schema.CollectionSchema;
@ -129,7 +129,7 @@ public class IndexDeletion_p {
if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub; else urlStub = "http://" + urlStub;
}
try {
DigestURI u = new DigestURI(urlStub);
DigestURL u = new DigestURL(urlStub);
BlockingQueue<SolrDocument> dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + u.getHost() + "\"", 0, 100000000, Long.MAX_VALUE, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
SolrDocument doc;
try {

@ -26,7 +26,7 @@ import java.util.ArrayList;
import org.apache.solr.common.SolrException;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;

@ -31,6 +31,7 @@ import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -38,7 +39,6 @@ import net.yacy.data.WorkTables;
import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.document.importer.OAIPMHLoader;
import net.yacy.document.importer.ResumptionToken;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -59,9 +59,9 @@ public class IndexImportOAIPMH_p {
if (post.containsKey("urlstartone")) {
String oaipmhurl = post.get("urlstartone");
if (oaipmhurl.indexOf('?',0) < 0) oaipmhurl = oaipmhurl + "?verb=ListRecords&metadataPrefix=oai_dc";
DigestURI url = null;
DigestURL url = null;
try {
url = new DigestURI(oaipmhurl);
url = new DigestURL(oaipmhurl);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, agent);
final ResumptionToken rt = r.getResumptionToken();
@ -72,7 +72,7 @@ public class IndexImportOAIPMH_p {
// set next default url
try {
final DigestURI nexturl = (rt == null) ? null : rt.resumptionURL();
final DigestURL nexturl = (rt == null) ? null : rt.resumptionURL();
if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true));
} catch (final MalformedURLException e) {
prop.put("defaulturl", e.getMessage());
@ -94,9 +94,9 @@ public class IndexImportOAIPMH_p {
if (post.get("urlstart", "").length() > 0) {
final String oaipmhurl = post.get("urlstart", "");
sb.tables.recordAPICall(post, "IndexImportOAIPMH_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "OAI-PMH import for " + oaipmhurl);
DigestURI url = null;
DigestURL url = null;
try {
url = new DigestURI(oaipmhurl);
url = new DigestURL(oaipmhurl);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final OAIPMHImporter job = new OAIPMHImporter(sb.loader, agent, url);
job.start();
@ -129,12 +129,12 @@ public class IndexImportOAIPMH_p {
final Random r = new Random(System.currentTimeMillis());
// start jobs for the sources
DigestURI url = null;
DigestURL url = null;
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
while (!sourceList.isEmpty()) {
final String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size()));
try {
url = new DigestURI(oaipmhurl);
url = new DigestURL(oaipmhurl);
final OAIPMHImporter job = new OAIPMHImporter(sb.loader, agent, url);
job.start();
} catch (final MalformedURLException e) {

@ -29,12 +29,13 @@ import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -48,7 +49,6 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
@ -169,9 +169,9 @@ public class Load_RSS_p {
ConcurrentLog.logException(e);
continue;
}
DigestURI url = null;
DigestURL url = null;
try {
url = new DigestURI(row.get("url", ""));
url = new DigestURL(row.get("url", ""));
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "malformed url '" + row.get("url", "") + "': " + e.getMessage());
continue;
@ -203,7 +203,7 @@ public class Load_RSS_p {
messageurl = row.get("url", "");
if (messageurl.isEmpty()) continue;
// get referrer
final DigestURI referrer = sb.getURL(row.get("referrer", "").getBytes());
final DigestURL referrer = sb.getURL(row.get("referrer", "").getBytes());
// check if feed is registered in scheduler
final byte[] api_pk = row.get("api_pk");
final Row r = api_pk == null ? null : sb.tables.select("api", api_pk);
@ -257,9 +257,9 @@ public class Load_RSS_p {
boolean record_api = false;
DigestURI url = null;
DigestURL url = null;
try {
url = post.containsKey("url") ? new DigestURI(post.get("url", "")) : null;
url = post.containsKey("url") ? new DigestURL(post.get("url", "")) : null;
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Load_RSS_p", "url not well-formed: '" + post.get("url", "") + "'");
}
@ -280,12 +280,12 @@ public class Load_RSS_p {
// index all selected items: description only
if (rss != null && post.containsKey("indexSelectedItemContent")) {
final RSSFeed feed = rss.getFeed();
List<DigestURI> list = new ArrayList<DigestURI>();
List<DigestURL> list = new ArrayList<DigestURL>();
Map<String, RSSMessage> messages = new HashMap<String, RSSMessage>();
loop: for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
final RSSMessage message = feed.getMessage(entry.getValue().substring(5));
final DigestURI messageurl = new DigestURI(message.getLink());
final DigestURL messageurl = new DigestURL(message.getLink());
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop;
messages.put(ASCII.String(messageurl.hash()), message);
} catch (final IOException e) {
@ -296,7 +296,7 @@ public class Load_RSS_p {
loop: for (final Map.Entry<String, RSSMessage> entry: messages.entrySet()) {
try {
final RSSMessage message = entry.getValue();
final DigestURI messageurl = new DigestURI(message.getLink());
final DigestURL messageurl = new DigestURL(message.getLink());
if (existingurls.get(ASCII.String(messageurl.hash())) != null) continue loop;
list.add(messageurl);
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
@ -334,10 +334,10 @@ public class Load_RSS_p {
prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL());
prop.putHTML("showitems_docs", channel == null ? "" : channel.getDocs());
Map<String, DigestURI> urls = new HashMap<String, DigestURI>();
Map<String, DigestURL> urls = new HashMap<String, DigestURL>();
for (final Hit item: feed) {
try {
final DigestURI messageurl = new DigestURI(item.getLink());
final DigestURL messageurl = new DigestURL(item.getLink());
urls.put(ASCII.String(messageurl.hash()), messageurl);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
@ -349,7 +349,7 @@ public class Load_RSS_p {
int i = 0;
for (final Hit item: feed) {
try {
final DigestURI messageurl = new DigestURI(item.getLink());
final DigestURL messageurl = new DigestURL(item.getLink());
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();

@ -30,7 +30,7 @@ import java.util.Date;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;

@ -32,7 +32,7 @@ import java.util.Iterator;
import java.util.Locale;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.MessageBoard;

@ -32,7 +32,8 @@
import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
@ -41,7 +42,6 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.NumberTools;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.server.serverObjects;
@ -112,12 +112,12 @@ public class QuickCrawlLink_p {
if (crawlingStart != null) {
crawlingStart = crawlingStart.trim();
try {crawlingStart = new DigestURI(crawlingStart).toNormalform(true);} catch (final MalformedURLException e1) {}
try {crawlingStart = new DigestURL(crawlingStart).toNormalform(true);} catch (final MalformedURLException e1) {}
// check if url is proper
DigestURI crawlingStartURL = null;
DigestURL crawlingStartURL = null;
try {
crawlingStartURL = new DigestURI(crawlingStart);
crawlingStartURL = new DigestURL(crawlingStart);
} catch (final MalformedURLException e) {
prop.put("mode_status", "1");
prop.put("mode_code", "1");

@ -23,13 +23,13 @@ import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.protocol.Scanner.Access;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -54,7 +54,7 @@ public class ServerScannerList {
// show scancache table
prop.put("servertable", 1);
String urlString;
DigestURI u;
DigestURL u;
table: while (true) {
try {
int i = 0;

@ -35,7 +35,7 @@ import java.util.StringTokenizer;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.Domains;
@ -62,7 +62,7 @@ public class SettingsAck_p {
final Switchboard sb = (Switchboard) env;
// get referer for backlink
final MultiProtocolURI referer = header.referer();
final MultiProtocolURL referer = header.referer();
prop.put("referer", (referer == null) ? "Settings_p.html" : referer.toNormalform(true));
//if (post == null) System.out.println("POST: NULL"); else System.out.println("POST: " + post.toString());

@ -31,13 +31,13 @@ import java.util.HashMap;
import java.util.Iterator;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.peers.NewsDB;
@ -129,7 +129,7 @@ public class Supporter {
url = row.getPrimaryKeyUTF8().trim();
try {
if (Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS, new DigestURI(url, urlhash.getBytes()))) continue;
if (Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS, new DigestURL(url, urlhash.getBytes()))) continue;
} catch (final MalformedURLException e) {
continue;
}
@ -247,13 +247,13 @@ public class Supporter {
// add/subtract votes and write record
if (entry != null) {
try {
urlhash = ASCII.String((new DigestURI(url)).hash());
urlhash = ASCII.String((new DigestURL(url)).hash());
} catch (final MalformedURLException e) {
urlhash = null;
}
if (urlhash == null)
try {
urlhash = ASCII.String((new DigestURI("http://" + url)).hash());
urlhash = ASCII.String((new DigestURL("http://" + url)).hash());
} catch (final MalformedURLException e) {
urlhash = null;
}

@ -31,12 +31,12 @@ import java.util.HashMap;
import java.util.Iterator;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.peers.NewsDB;
@ -136,7 +136,7 @@ public class Surftips {
url = row.getPrimaryKeyUTF8().trim();
try{
if(Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS ,new DigestURI(url)))
if(Switchboard.urlBlacklist.isListed(BlacklistType.SURFTIPS ,new DigestURL(url)))
continue;
}catch(final MalformedURLException e){continue;}
title = row.getColUTF8(1);
@ -306,13 +306,13 @@ public class Surftips {
// add/subtract votes and write record
if (entry != null) {
try {
urlhash = UTF8.String((new DigestURI(url)).hash());
urlhash = UTF8.String((new DigestURL(url)).hash());
} catch (final MalformedURLException e) {
urlhash = null;
}
if (urlhash == null)
try {
urlhash = UTF8.String((new DigestURI("http://"+url)).hash());
urlhash = UTF8.String((new DigestURL("http://"+url)).hash());
} catch (final MalformedURLException e) {
urlhash = null;
}

@ -29,7 +29,7 @@ import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;

@ -5,7 +5,7 @@ import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;

@ -25,7 +25,7 @@ import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;

@ -22,10 +22,10 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.http.HTTPDemon;
@ -70,7 +70,7 @@ public class Triple_p {
for (String s: list.split("\n")) {
String newurl = s;
try {
DigestURI d = new DigestURI (s);
DigestURL d = new DigestURL (s);
if (d.getHost().endsWith(".yacy")) {
newurl = d.getProtocol()+"://"+HTTPDemon.getAlternativeResolver().resolve(d.getHost())+d.getPath();

@ -36,9 +36,10 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
@ -54,7 +55,6 @@ import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
@ -107,7 +107,7 @@ public class ViewFile {
final String viewMode = post.get("viewMode","parsed");
prop.put("error_vMode-" + viewMode, "1");
DigestURI url = null;
DigestURL url = null;
String descr = "";
final int wordCount = 0;
int size = 0;
@ -127,7 +127,7 @@ public class ViewFile {
}
// define an url by post parameter
url = new DigestURI(MultiProtocolURI.unescape(urlString));
url = new DigestURL(MultiProtocolURL.unescape(urlString));
urlHash = ASCII.String(url.hash());
pre = post.getBoolean("pre");
} catch (final MalformedURLException e) {}
@ -185,7 +185,7 @@ public class ViewFile {
}
final String[] wordArray = wordArray(post.get("words", null));
final String ext = MultiProtocolURI.getFileExtension(url.getFileName());
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (viewMode.equals("plain")) {
// TODO: how to handle very large files here ?
@ -311,11 +311,11 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
boolean dark = true;
int i = 0;
i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0), document.getAnchors());
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors());
i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0));
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
final Map<DigestURI, ImageEntry> ts = document.getImages();
final Map<DigestURL, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry;
while (tsi.hasNext()) {
@ -335,8 +335,8 @@ public class ViewFile {
dark = !dark;
i++;
}
i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0), document.getAnchors());
i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0), document.getAnchors());
i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0));
i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0));
prop.put("viewMode_links", i);
}
@ -439,13 +439,12 @@ public class ViewFile {
final serverObjects prop,
final String[] wordArray,
int c,
final Map<DigestURI, String> media,
final Map<DigestURL, String> media,
final String type,
boolean dark,
final Map<DigestURI, Properties> alllinks) {
boolean dark) {
int i = 0;
for (final Map.Entry<DigestURI, String> entry : media.entrySet()) {
final Properties p = alllinks.get(entry.getKey());
for (final Map.Entry<DigestURL, String> entry : media.entrySet()) {
final Properties p = entry.getKey().getProperties();
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
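
Illustration (not part of the commit): a minimal sketch of the reworked putMediaInfo logic above, where the anchor attributes travel with the URL object itself, so the former Map<DigestURI, Properties> lookup table is no longer passed in. getProperties() and the "name"/"rel"/"text" keys are taken from the hunk; the wrapper class is hypothetical.

import java.util.Map;
import java.util.Properties;

import net.yacy.cora.document.id.DigestURL;

class MediaInfoSketch {
    static void listMedia(final Map<DigestURL, String> media) {
        for (final Map.Entry<DigestURL, String> entry : media.entrySet()) {
            final Properties p = entry.getKey().getProperties();   // anchor attributes carried by the URL itself
            System.out.println(entry.getKey().toNormalform(true)
                    + " name=" + p.getProperty("name", "")          // the name attribute
                    + " rel=" + p.getProperty("rel", "")            // the rel-attribute
                    + " text=" + p.getProperty("text", ""));        // the text between the <a></a> tag
        }
    }
}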

@ -32,6 +32,7 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Map;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
@ -41,7 +42,6 @@ import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.URLLicense;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.workflow.WorkflowProcessor;
@ -74,9 +74,9 @@ public class ViewImage {
final String urlLicense = post.get("code", "");
final boolean auth = Domains.isLocalhost(header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "")) || sb.verifyAuthentication(header); // handle access rights
DigestURI url = null;
DigestURL url = null;
if ((urlString.length() > 0) && (auth)) try {
url = new DigestURI(urlString);
url = new DigestURL(urlString);
} catch (final MalformedURLException e1) {
url = null;
}
@ -84,7 +84,7 @@ public class ViewImage {
if ((url == null) && (urlLicense.length() > 0)) {
urlString = URLLicense.releaseLicense(urlLicense);
try {
url = new DigestURI(urlString);
url = new DigestURL(urlString);
} catch (final MalformedURLException e1) {
url = null;
urlString = null;

@ -26,7 +26,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.lod.vocabulary.DCTerms;
import net.yacy.cora.lod.vocabulary.Owl;
import net.yacy.cora.lod.vocabulary.Tagging;
@ -35,7 +36,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
@ -59,8 +59,8 @@ public class Vocabulary_p {
// create a vocabulary
if (discovername != null && discovername.length() > 0) {
String discoverobjectspace = post.get("discoverobjectspace", "");
MultiProtocolURI discoveruri = null;
if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (final MalformedURLException e) {}
MultiProtocolURL discoveruri = null;
if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {}
if (discoveruri == null) discoverobjectspace = "";
Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
@ -72,9 +72,9 @@ public class Vocabulary_p {
Segment segment = sb.index;
String t;
if (!discoverNot) {
Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
Iterator<DigestURL> ui = segment.urlSelector(discoveruri, 600000L, 100000);
while (ui.hasNext()) {
DigestURI u = ui.next();
DigestURL u = ui.next();
String u0 = u.toNormalform(true);
t = "";
if (discoverFromPath) {
@ -131,7 +131,7 @@ public class Vocabulary_p {
if (post.get("add_new", "").equals("checked") && post.get("newterm", "").length() > 0) {
String objectlink = post.get("newobjectlink", "");
if (objectlink.length() > 0) try {
objectlink = new MultiProtocolURI(objectlink).toNormalform(true);
objectlink = new MultiProtocolURL(objectlink).toNormalform(true);
} catch (final MalformedURLException e) {}
vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", ""), objectlink);
}

@ -6,11 +6,11 @@
import java.util.Iterator;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -64,7 +64,7 @@ public class WatchWebStructure_p {
// fix start point if a "www."-prefix would be better
if (host != null && !host.startsWith("www")) {
if (sb.webStructure.referencesCount(DigestURI.hosthash6("www." + host)) > sb.webStructure.referencesCount(DigestURI.hosthash6(host))) {
if (sb.webStructure.referencesCount(DigestURL.hosthash6("www." + host)) > sb.webStructure.referencesCount(DigestURL.hosthash6(host))) {
host = "www." + host;
}
}

@ -33,12 +33,12 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.peers.graphics.WebStructureGraph;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -116,7 +116,7 @@ public class WebStructurePicture_p {
for (int i = 0; i < hostlist.length; i++) {
String host = hostlist[i];
String hash = null;
try {hash = ASCII.String((new DigestURI("http://" + host)).hash(), 6, 6);} catch (final MalformedURLException e) {ConcurrentLog.logException(e);}
try {hash = ASCII.String((new DigestURL("http://" + host)).hash(), 6, 6);} catch (final MalformedURLException e) {ConcurrentLog.logException(e);}
Map.Entry<String, String> centernode = new AbstractMap.SimpleEntry<String, String>(hash, host);
double angle = 2.0d * i * Math.PI / hostlist.length;
if (hostlist.length == 3) angle -= Math.PI / 2;
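
The 6-character string computed above is the host part of the 12-character URL hash and serves as the node key of the web structure graph. A hedged sketch of the two derivations used in this diff, assuming DigestURL.hash() and the hosthash6() helper seen in WatchWebStructure_p behave as their usages suggest; the wrapper class and host name are invented:

import java.net.MalformedURLException;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;

public class HostHashSketch {
    public static void main(final String[] args) {
        final String host = "example.org"; // hypothetical host
        String hash = null;
        try {
            // characters 6..11 of the 12-character URL hash identify the host
            hash = ASCII.String(new DigestURL("http://" + host).hash(), 6, 6);
        } catch (final MalformedURLException e) {
            ConcurrentLog.logException(e);
        }
        System.out.println("from url hash : " + hash);
        // WatchWebStructure_p uses this shortcut to obtain the same kind of key
        System.out.println("from hosthash6: " + DigestURL.hosthash6(host));
    }
}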

@ -38,7 +38,7 @@ import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;

@ -1,7 +1,7 @@
import java.io.IOException;
import java.util.Iterator;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;

@ -1,9 +1,9 @@
import java.net.MalformedURLException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -23,7 +23,7 @@ public class delete_p {
return prop;
}
try {
if (post.containsKey("url") && switchboard.bookmarksDB.removeBookmark(ASCII.String((new DigestURI(post.get("url", "nourl"))).hash()))) {
if (post.containsKey("url") && switchboard.bookmarksDB.removeBookmark(ASCII.String((new DigestURL(post.get("url", "nourl"))).hash()))) {
prop.put("result", "1");
} else if (post.containsKey("urlhash") && switchboard.bookmarksDB.removeBookmark(post.get("urlhash", "nohash"))) {
prop.put("result", "1");

@ -30,12 +30,12 @@ import java.util.TreeSet;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.document.SentenceReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionSchema;
@ -56,7 +56,7 @@ public class citation {
prop.put("citations", 0);
prop.put("sentences", 0);
DigestURI uri = null;
DigestURL uri = null;
String url = "";
String hash = "";
int ch = 10;
@ -81,7 +81,7 @@ public class citation {
if (url.length() > 0) {
try {
uri = new DigestURI(url, null);
uri = new DigestURL(url, null);
hash = ASCII.String(uri.hash());
} catch (final MalformedURLException e) {}
}
@ -118,7 +118,7 @@ public class citation {
// for each line make a statistic about the number of occurrences somewhere else
OrderedScoreMap<String> scores = new OrderedScoreMap<String>(null); // accumulates scores for citating urls
LinkedHashMap<String, Set<DigestURI>> sentenceOcc = new LinkedHashMap<String, Set<DigestURI>>();
LinkedHashMap<String, Set<DigestURL>> sentenceOcc = new LinkedHashMap<String, Set<DigestURL>>();
for (String sentence: sentences) {
if (sentence == null || sentence.length() < 40) {
// do not count the very short sentences
@ -130,12 +130,12 @@ public class citation {
SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", 0, 100, CollectionSchema.sku.getSolrFieldName());
int count = (int) doclist.getNumFound();
if (count > 0) {
Set<DigestURI> list = new TreeSet<DigestURI>();
Set<DigestURL> list = new TreeSet<DigestURL>();
for (SolrDocument d: doclist) {
String u = (String) d.getFieldValue(CollectionSchema.sku.getSolrFieldName());
if (u == null || u.equals(url)) continue;
scores.inc(u);
try {list.add(new DigestURI(u, null));} catch (final MalformedURLException e) {}
try {list.add(new DigestURL(u, null));} catch (final MalformedURLException e) {}
}
sentenceOcc.put(sentence, list);
}
@ -147,13 +147,13 @@ public class citation {
// iterate the sentences
int i = 0;
for (Map.Entry<String, Set<DigestURI>> se: sentenceOcc.entrySet()) {
for (Map.Entry<String, Set<DigestURL>> se: sentenceOcc.entrySet()) {
prop.put("sentences_" + i + "_dt", i);
StringBuilder dd = new StringBuilder(se.getKey());
Set<DigestURI> app = se.getValue();
Set<DigestURL> app = se.getValue();
if (app != null && app.size() > 0) {
dd.append("<br/>appears in:");
for (DigestURI u: app) {
for (DigestURL u: app) {
if (u != null) {
dd.append(" <a href=\"").append(u.toNormalform(false)).append("\">").append(u.getHost()).append("</a>");
}
@ -168,12 +168,12 @@ public class citation {
i = 0;
for (String u: scores.keyList(false)) {
try {
DigestURI uu = new DigestURI(u, null);
DigestURL uu = new DigestURL(u, null);
prop.put("citations_" + i + "_dt", "<a href=\"" + u + "\">" + u + "</a>");
StringBuilder dd = new StringBuilder();
dd.append("makes ").append(Integer.toString(scores.get(u))).append(" citations: of ").append(url);
for (Map.Entry<String, Set<DigestURI>> se: sentenceOcc.entrySet()) {
Set<DigestURI> occurls = se.getValue();
for (Map.Entry<String, Set<DigestURL>> se: sentenceOcc.entrySet()) {
Set<DigestURL> occurls = se.getValue();
if (occurls != null && occurls.contains(uu)) dd.append("<br/><a href=\"/solr/select?q=text_t:%22").append(se.getKey().replace('"', '\'')).append("%22&rows=100&grep=&wt=grephtml\">").append(se.getKey()).append("</a>");
}
prop.put("citations_" + i + "_dd", dd.toString());
@ -187,7 +187,7 @@ public class citation {
for (String u: scores.keyList(false)) {
if (scores.get(u) < ch) continue;
try {
DigestURI uu = new DigestURI(u, null);
DigestURL uu = new DigestURL(u, null);
if (uu.getOrganization().equals(uri.getOrganization())) continue;
prop.put("similar_links_" + i + "_url", u);
i++;
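
The citation statistics now key every collection on DigestURL. A compact sketch of the sentence-to-citing-URLs map built above, assuming the DigestURL(String, byte[]) constructor used in this file; the wrapper class, sentence and URL are invented:

import java.net.MalformedURLException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import net.yacy.cora.document.id.DigestURL;

public class CitationMapSketch {
    public static void main(final String[] args) throws MalformedURLException {
        // sentence -> set of other URLs in which the same sentence was found
        final LinkedHashMap<String, Set<DigestURL>> sentenceOcc = new LinkedHashMap<String, Set<DigestURL>>();
        final Set<DigestURL> citing = new TreeSet<DigestURL>();
        citing.add(new DigestURL("http://example.com/copy.html", null)); // hypothetical citing document
        sentenceOcc.put("A sentence that is long enough to be counted as a citation candidate here.", citing);
        for (final Map.Entry<String, Set<DigestURL>> se : sentenceOcc.entrySet()) {
            System.out.println(se.getKey() + " appears in " + se.getValue().size() + " other document(s)");
        }
    }
}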

@ -3,8 +3,8 @@
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.EventChannel;
import net.yacy.search.Switchboard;

@ -26,19 +26,21 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -90,9 +92,9 @@ public class getpageinfo {
url = "http://" + url;
}
if (actions.indexOf("title",0) >= 0) {
DigestURI u = null;
DigestURL u = null;
try {
u = new DigestURI(url);
u = new DigestURL(url);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
}
@ -129,11 +131,11 @@ public class getpageinfo {
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set<DigestURI> uris = scraper.getAnchors().keySet();
final Collection<AnchorURL> uris = scraper.getAnchors();
final StringBuilder links = new StringBuilder(uris.size() * 80);
final StringBuilder filter = new StringBuilder(uris.size() * 40);
count = 0;
for (final DigestURI uri: uris) {
for (final DigestURL uri: uris) {
if (uri == null) continue;
links.append(';').append(uri.toNormalform(true));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
@ -147,7 +149,7 @@ public class getpageinfo {
}
if (actions.indexOf("robots",0) >= 0) {
try {
final DigestURI theURL = new DigestURI(url);
final DigestURL theURL = new DigestURL(url);
// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
@ -155,7 +157,7 @@ public class getpageinfo {
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
// get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
final MultiProtocolURL sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
@ -163,7 +165,7 @@ public class getpageinfo {
}
if (actions.indexOf("oai",0) >= 0) {
try {
final DigestURI theURL = new DigestURI(url
final DigestURL theURL = new DigestURL(url
+ "?verb=Identify");
final String oairesult = checkOAI(theURL.toString());
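
Both getpageinfo and getpageinfo_p now receive the page links as a Collection<AnchorURL> instead of the key set of a DigestURI map. A minimal sketch of the link list and crawl filter construction, assuming only the AnchorURL(String) constructor from this commit; the wrapper class and URLs are invented:

import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;

import net.yacy.cora.document.id.AnchorURL;

public class PageLinkSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final Collection<AnchorURL> uris = new ArrayList<AnchorURL>();
        uris.add(new AnchorURL("http://example.org/a.html")); // hypothetical anchors
        uris.add(new AnchorURL("https://example.net/b.html"));

        // build the semicolon-separated link list and the host filter as above
        final StringBuilder links = new StringBuilder(uris.size() * 80);
        final StringBuilder filter = new StringBuilder(uris.size() * 40);
        for (final AnchorURL uri : uris) {
            if (uri == null) continue;
            links.append(';').append(uri.toNormalform(true));
            filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
        }
        System.out.println("links  = " + links);
        System.out.println("filter = " + filter);
    }
}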

@ -26,19 +26,21 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -88,9 +90,9 @@ public class getpageinfo_p {
url = "http://" + url;
}
if (actions.indexOf("title",0) >= 0) {
DigestURI u = null;
DigestURL u = null;
try {
u = new DigestURI(url);
u = new DigestURL(url);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
}
@ -128,11 +130,11 @@ public class getpageinfo_p {
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set<DigestURI> uris = scraper.getAnchors().keySet();
final Collection<AnchorURL> uris = scraper.getAnchors();
final StringBuilder links = new StringBuilder(uris.size() * 80);
final StringBuilder filter = new StringBuilder(uris.size() * 40);
count = 0;
for (final DigestURI uri: uris) {
for (final DigestURL uri: uris) {
if (uri == null) continue;
links.append(';').append(uri.toNormalform(true));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
@ -146,7 +148,7 @@ public class getpageinfo_p {
}
if (actions.indexOf("robots",0) >= 0) {
try {
final DigestURI theURL = new DigestURI(url);
final DigestURL theURL = new DigestURL(url);
// determine if crawling of the current URL is allowed
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
@ -156,7 +158,7 @@ public class getpageinfo_p {
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
// get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
final MultiProtocolURL sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
@ -164,7 +166,7 @@ public class getpageinfo_p {
}
if (actions.indexOf("oai",0) >= 0) {
try {
final DigestURI theURL = new DigestURI(url
final DigestURL theURL = new DigestURL(url
+ "?verb=Identify");
final String oairesult = checkOAI(theURL.toString());

@ -24,7 +24,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;

@ -25,7 +25,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.Rating;
import net.yacy.cora.util.ConcurrentLog;

@ -29,14 +29,14 @@ import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.peers.graphics.WebStructureGraph;
@ -56,7 +56,7 @@ public class webstructure {
prop.put("citations", 0);
boolean authenticated = sb.adminAuthenticated(header) >= 2;
if (about != null) {
DigestURI url = null;
DigestURL url = null;
byte[] urlhash = null;
String hosthash = null;
if (about.length() == 6 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
@ -68,7 +68,7 @@ public class webstructure {
} else if (authenticated && about.length() > 0) {
// consider "about" as url or hostname
try {
url = new DigestURI(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
urlhash = url.hash();
hosthash = ASCII.String(urlhash, 6, 6);
} catch (final MalformedURLException e) {
@ -111,18 +111,18 @@ public class webstructure {
prop.put("references_documents_0_urle", url == null ? 0 : 1);
if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true));
int d = 0;
Iterator<DigestURI> i = scraper.inboundLinks().iterator();
Iterator<DigestURL> i = scraper.inboundLinks().keySet().iterator();
while (i.hasNext()) {
DigestURI refurl = i.next();
DigestURL refurl = i.next();
byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
prop.put("references_documents_0_anchors_" + d + "_outbound", 0);
d++;
}
i = scraper.outboundLinks().iterator();
i = scraper.outboundLinks().keySet().iterator();
while (i.hasNext()) {
DigestURI refurl = i.next();
DigestURL refurl = i.next();
byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
@ -158,7 +158,7 @@ public class webstructure {
while (i.hasNext()) {
CitationReference cr = i.next();
byte[] refhash = cr.urlhash();
DigestURI refurl = authenticated ? sb.getURL(refhash) : null;
DigestURL refurl = authenticated ? sb.getURL(refhash) : null;
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);

@ -30,13 +30,13 @@ import java.util.Arrays;
import java.util.Iterator;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.Switchboard;
@ -90,7 +90,7 @@ public class yacydoc {
if (urlstring.length() > 0 && urlhash.isEmpty()) {
try {
final DigestURI url = new DigestURI(urlstring);
final DigestURL url = new DigestURL(urlstring);
urlhash = ASCII.String(url.hash());
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);

@ -1,5 +1,6 @@
import java.io.IOException;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -8,7 +9,6 @@ import net.yacy.data.ymark.YMarkEntry;
import net.yacy.data.ymark.YMarkTables;
import net.yacy.data.ymark.YMarkUtil;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -34,7 +34,7 @@ public class add_ymark {
if (post.containsKey("urlHash")) {
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
final DigestURI url = sb.index.fulltext().getURL(urlHash.getBytes());
final DigestURL url = sb.index.fulltext().getURL(urlHash.getBytes());
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
try {

@ -4,6 +4,7 @@ import java.util.EnumMap;
import java.util.Iterator;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -16,7 +17,6 @@ import net.yacy.data.ymark.YMarkTables;
import net.yacy.data.ymark.YMarkUtil;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -49,7 +49,7 @@ public class get_metadata {
}
try {
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.index);
final YMarkMetadata meta = new YMarkMetadata(new DigestURL(url), sb.index);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final Document document = meta.loadDocument(sb.loader, agent);
final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();

@ -7,7 +7,8 @@ import java.util.TreeMap;
import java.util.regex.Pattern;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -23,7 +24,6 @@ import net.yacy.data.ymark.YMarkUtil;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -212,7 +212,7 @@ public class get_treeview {
}
} else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) {
try {
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.index);
final YMarkMetadata meta = new YMarkMetadata(new DigestURL(post.get(ROOT).substring(2)), sb.index);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final Document document = meta.loadDocument(sb.loader, agent);
final TreeMap<String, YMarkTag> tags = sb.tables.bookmarks.getTags(bmk_user);

@ -2,7 +2,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.UserDB;

@ -3,7 +3,7 @@ import java.util.Collection;
import java.util.Iterator;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.UserDB;

@ -12,7 +12,7 @@ import java.util.concurrent.ArrayBlockingQueue;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;

@ -28,8 +28,9 @@
import java.awt.Image;
import java.io.File;
import java.io.IOException;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.ImageParser;
@ -45,7 +46,7 @@ public class cytag {
public static Image respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard)env;
final MultiProtocolURI referer = header.referer();
final MultiProtocolURL referer = header.referer();
// harvest request information
StringBuilder connect = new StringBuilder();

@ -25,7 +25,7 @@ import java.io.Writer;
import java.util.Date;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter;

@ -32,7 +32,7 @@ package interaction;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.interaction.Interaction;

@ -27,7 +27,7 @@
import java.io.File;
import java.io.IOException;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.MediawikiImporter;
import net.yacy.search.Switchboard;

@ -29,11 +29,11 @@ import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.peers.DHTSelection;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;
@ -58,15 +58,15 @@ public class rct_p {
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack
DigestURI url;
DigestURL url;
try {
url = new DigestURI(item.getLink());
url = new DigestURL(item.getLink());
} catch (final MalformedURLException e) {
url = null;
}
Date loaddate;
loaddate = item.getPubDate();
final DigestURI referrer = null; // referrer needed!
final DigestURL referrer = null; // referrer needed!
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url);
if (urlRejectReason == null) {
// stack url
@ -101,7 +101,7 @@ public class rct_p {
* @param url
* @return
*/
private static String urlToString(final DigestURI url) {
private static String urlToString(final DigestURL url) {
return (url == null ? "null" : url.toNormalform(true));
}

@ -38,14 +38,14 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.data.ListManager;
import net.yacy.data.list.ListAccumulator;
import net.yacy.data.list.XMLBlacklistImporter;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.Seed;
import net.yacy.repository.Blacklist;
@ -137,7 +137,7 @@ public class sharedBlacklist_p {
// download the blacklist
try {
// get List
final DigestURI u = new DigestURI(downloadURLOld);
final DigestURL u = new DigestURL(downloadURLOld);
otherBlacklist = FileUtils.strings(u.get(agent));
} catch (final Exception e) {
@ -155,7 +155,7 @@ public class sharedBlacklist_p {
prop.putHTML("page_source", downloadURL);
try {
final DigestURI u = new DigestURI(downloadURL);
final DigestURL u = new DigestURL(downloadURL);
otherBlacklist = FileUtils.strings(u.get(agent));
} catch (final Exception e) {
prop.put("status", STATUS_URL_PROBLEM);

@ -27,7 +27,7 @@ import java.util.Map;
import javax.servlet.ServletException;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SolrServlet;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;

@ -29,7 +29,7 @@
import java.io.IOException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.ResultURLs;

@ -24,7 +24,7 @@
import java.util.Iterator;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceContainerCache;

@ -35,7 +35,7 @@ import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;

@ -36,10 +36,11 @@ import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
@ -50,7 +51,6 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -241,7 +241,7 @@ public final class search {
null,
false,
null,
DigestURI.TLD_any_zone_filter,
DigestURL.TLD_any_zone_filter,
client,
false,
indexSegment,
@ -305,7 +305,7 @@ public final class search {
constraint,
false,
null,
DigestURI.TLD_any_zone_filter,
DigestURL.TLD_any_zone_filter,
client,
false,
sb.index,

@ -32,9 +32,9 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.federate.yacy.Distribution;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;

@ -33,8 +33,8 @@ import java.util.Map;
import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.ResultURLs;

@ -28,12 +28,12 @@ import java.io.IOException;
import java.util.Date;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.Protocol;
import net.yacy.search.Switchboard;
@ -65,7 +65,7 @@ public class urls {
final long timeout = System.currentTimeMillis() + maxTime;
int c = 0;
Request entry;
DigestURI referrer;
DigestURL referrer;
while ((maxCount > 0) &&
(System.currentTimeMillis() < timeout) &&
(sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
@ -112,7 +112,7 @@ public class urls {
final int count = urlhashes.length() / 12;
int c = 0;
URIMetadataNode entry;
DigestURI referrer;
DigestURL referrer;
for (int i = 0; i < count; i++) {
entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
if (entry == null) continue;

@ -42,9 +42,9 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@ -62,7 +62,6 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.Formatter;
@ -606,7 +605,7 @@ public class yacysearch {
return prop;
}
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash));
final DigestURL url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash));
if ( url != null ) {
try {
sb.tables.bookmarks.createBookmark(
@ -658,8 +657,8 @@ public class yacysearch {
clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted ? QueryParams.Searchdom.GLOBAL : QueryParams.Searchdom.LOCAL),
constraint,
true,
DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")),
DigestURI.TLD_any_zone_filter,
DigestURL.hosthashess(sb.getConfig("search.excludehosth", "")),
DigestURL.TLD_any_zone_filter,
client,
authenticated,
indexSegment,

@ -23,7 +23,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.protocol.Domains;

@ -28,11 +28,12 @@ import java.net.MalformedURLException;
import java.util.List;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
@ -41,7 +42,6 @@ import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.Cache;
import net.yacy.data.URLLicense;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.Formatter;
import net.yacy.peers.NewsPool;
import net.yacy.peers.Seed;
@ -116,13 +116,13 @@ public class yacysearchitem {
final ResultEntry result = theSearch.oneResult(item, timeout);
if (result == null) return prop; // no content
final String resultUrlstring = result.urlstring();
final DigestURI resultURL = result.url();
final DigestURL resultURL = result.url();
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final int port = resultURL.getPort();
DigestURI faviconURL = null;
DigestURL faviconURL = null;
if ((fileType == FileType.HTML || fileType == FileType.JSON) && !sb.isIntranetMode()) try {
faviconURL = new DigestURI(resultURL.getProtocol() + "://" + resultURL.getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico");
faviconURL = new DigestURL(resultURL.getProtocol() + "://" + resultURL.getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico");
} catch (final MalformedURLException e1) {
ConcurrentLog.logException(e1);
faviconURL = null;
@ -166,7 +166,7 @@ public class yacysearchitem {
// check if url is allowed to view
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
try {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI (modifyURL)) == null) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL (modifyURL)) == null) {
modifyURL = "./proxy.html?url="+modifyURL;
}
} catch (final MalformedURLException e) {
@ -177,7 +177,7 @@ public class yacysearchitem {
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("yacy")) {
try {
if ((new DigestURI (modifyURL).getHost().endsWith(".yacy"))) {
if ((new DigestURL (modifyURL).getHost().endsWith(".yacy"))) {
modifyURL = "./proxy.html?url="+modifyURL;
}
} catch (final MalformedURLException e) {
@ -245,7 +245,7 @@ public class yacysearchitem {
prop.put("content_heuristic_name", heuristic.heuristicName);
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false);
final String ext = MultiProtocolURI.getFileExtension(resultFileName).toLowerCase();
final String ext = MultiProtocolURL.getFileExtension(resultFileName).toLowerCase();
if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) {
final String license = URLLicense.aquireLicense(resultURL);
prop.put("content_code", license);

@ -27,8 +27,8 @@
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ScoreMap;
@ -346,7 +346,7 @@ public class yacysearchtrailer {
if (count == 0) {
break;
}
nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Tagging.encodePrintname(name)).toString();
nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURL.escape(Tagging.encodePrintname(name)).toString();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {

@ -5,7 +5,7 @@ import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;

@ -28,7 +28,7 @@ import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
public class Classification {
@ -200,11 +200,11 @@ public class Classification {
return ext == null ? "application/octet-stream" : mimeTable.getProperty(ext.toLowerCase(), dfltMime);
}
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()), dfltMime);
public static String url2mime(final MultiProtocolURL url, final String dfltMime) {
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURL.getFileExtension(url.getFileName()), dfltMime);
}
public static String url2mime(final MultiProtocolURI url) {
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
public static String url2mime(final MultiProtocolURL url) {
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURL.getFileExtension(url.getFileName()));
}
}
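
The MIME helpers now take MultiProtocolURL. A brief usage sketch, assuming the getFileExtension() and url2mime() signatures shown above; the wrapper class and file name are invented:

import java.net.MalformedURLException;

import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.id.MultiProtocolURL;

public class MimeSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final MultiProtocolURL url = new MultiProtocolURL("http://example.org/picture.png"); // hypothetical URL
        // the extension is taken from the file name and mapped through the mime table
        final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        System.out.println(ext + " -> " + Classification.url2mime(url, "application/octet-stream"));
    }
}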

@ -24,7 +24,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.encoding;
import java.util.Comparator;

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.encoding;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;

@ -24,7 +24,8 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.feed;
public interface Channel extends Iterable<Hit> {

@ -24,7 +24,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.feed;
public class Channels {

@ -24,7 +24,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.feed;
import java.util.Date;
import java.util.List;

@ -18,7 +18,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.feed;
import java.net.MalformedURLException;
import java.util.Collections;
@ -29,6 +29,7 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
public class RSSFeed implements Iterable<RSSMessage> {
@ -53,11 +54,11 @@ public class RSSFeed implements Iterable<RSSMessage> {
* @param links
* @param source
*/
public RSSFeed(Set<MultiProtocolURI> links, String source) {
public RSSFeed(Set<MultiProtocolURL> links, String source) {
this(Integer.MAX_VALUE);
String u;
RSSMessage message;
for (MultiProtocolURI uri: links) {
for (MultiProtocolURL uri: links) {
u = uri.toNormalform(true);
message = new RSSMessage(u, "", u);
message.setAuthor(source);
@ -81,10 +82,10 @@ public class RSSFeed implements Iterable<RSSMessage> {
return this.imageURL;
}
public Set<MultiProtocolURI> getLinks() {
Set<MultiProtocolURI> links = new HashSet<MultiProtocolURI>();
public Set<MultiProtocolURL> getLinks() {
Set<MultiProtocolURL> links = new HashSet<MultiProtocolURL>();
for (RSSMessage message: this.messages.values()) {
try {links.add(new MultiProtocolURI(message.getLink()));} catch (final MalformedURLException e) {}
try {links.add(new MultiProtocolURL(message.getLink()));} catch (final MalformedURLException e) {}
}
return links;
}
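
RSSFeed now exchanges links as Set<MultiProtocolURL>. A round-trip sketch, assuming the RSSFeed(Set, String) constructor and getLinks() shown above; the wrapper class, source name and link are invented:

import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;

import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.MultiProtocolURL;

public class FeedLinkSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final Set<MultiProtocolURL> links = new HashSet<MultiProtocolURL>();
        links.add(new MultiProtocolURL("http://example.org/post.html")); // hypothetical link
        // each link becomes one RSSMessage whose author is the given source name
        final RSSFeed feed = new RSSFeed(links, "example-source");
        for (final MultiProtocolURL u : feed.getLinks()) {
            System.out.println(u.toNormalform(true));
        }
    }
}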

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.feed;
import java.text.ParseException;
import java.util.ArrayList;
@ -37,6 +37,7 @@ import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.lod.vocabulary.DublinCore;
import net.yacy.cora.lod.vocabulary.Geo;
import net.yacy.cora.protocol.HeaderFramework;
@ -123,7 +124,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
this.map.put(Token.guid.name(), artificialGuidPrefix + Integer.toHexString((title + description + link).hashCode()));
}
public RSSMessage(final String title, final String description, final MultiProtocolURI link, final String guid) {
public RSSMessage(final String title, final String description, final MultiProtocolURL link, final String guid) {
this.map = new HashMap<String, String>();
if (title.length() > 0) this.map.put(Token.title.name(), title);
if (description.length() > 0) this.map.put(Token.description.name(), description);

@ -18,7 +18,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
package net.yacy.cora.document.feed;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
@ -30,7 +30,8 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.RSSMessage.Token;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSMessage.Token;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;

@ -0,0 +1,68 @@
/**
* Anchor
* Copyright 2013 by Michael Peter Christen
* first published 15.09.2013 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document.id;
import java.net.MalformedURLException;
import java.util.Properties;
public class AnchorURL extends DigestURL {
private static final long serialVersionUID = 1586579902179962086L;
private Properties properties; // may contain additional url properties, such as given in html a href-links
public AnchorURL(final String url) throws MalformedURLException {
super(url);
this.properties = new Properties();
}
public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException {
super(baseURL, relPath);
this.properties = new Properties();
}
public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
super(protocol, host, port, path);
this.properties = new Properties();
}
public Properties getProperties() {
return this.properties;
}
public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException {
if (relPath.startsWith("//")) {
// patch for urls starting with "//" which can be found in the wild
relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath;
}
if ((baseURL == null) ||
isHTTP(relPath) ||
isHTTPS(relPath) ||
isFTP(relPath) ||
isFile(relPath) ||
isSMB(relPath)/*||
relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) {
return new AnchorURL(relPath);
}
return new AnchorURL(baseURL, relPath);
}
}
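
A hypothetical usage sketch for the new AnchorURL class, assuming the newAnchor() factory and getProperties() accessor defined above; the property keys mirror the ones read back in ViewFile.putMediaInfo earlier in this diff, and the wrapper class and URLs are invented:

import java.net.MalformedURLException;
import java.util.Properties;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;

public class AnchorSketch {
    public static void main(final String[] args) throws MalformedURLException {
        // resolve a relative href against its base document
        final DigestURL base = new DigestURL("http://example.org/dir/index.html"); // hypothetical base
        final AnchorURL anchor = AnchorURL.newAnchor(base, "about.html");

        // attach the attributes found on the <a> tag to the anchor itself
        final Properties p = anchor.getProperties();
        p.setProperty("name", "about");
        p.setProperty("rel", "nofollow");
        p.setProperty("text", "About us");

        System.out.println(anchor.toNormalform(true) + " rel=" + p.getProperty("rel"));
    }
}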

@ -1,28 +1,24 @@
// DigestURI.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 13.07.2006 on http://yacy.net
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.data.meta;
/**
* DigestURL
* Copyright 2006 by Michael Peter Christen
* first published 13.07.2006 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document.id;
// this class exists to provide a system-wide normal form representation of urls,
// and to prevent the DNS queries that java.net.URL usage would otherwise cause.
@ -31,18 +27,17 @@ import java.io.File;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.util.ByteArray;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.index.RowHandleSet;
/**
* URI-object providing YaCy-hash computation
@ -51,13 +46,14 @@ import net.yacy.kelondro.index.RowHandleSet;
* For URIs pointing to resources not globally available,
* the domainhash-part gets one reserved value
*/
public class DigestURI extends MultiProtocolURI implements Serializable {
public class DigestURL extends MultiProtocolURL implements Serializable {
private static final long serialVersionUID = -1173233022912141885L;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
// class variables
private byte[] hash;
private Properties properties; // may contain additional url properties, such as given in html a href-links
/**
* Shortcut, calculate hash for shorted url/hostname
@ -67,9 +63,9 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
public static String hosthash(final String host) {
String h = host;
if (!h.startsWith("http://")) h = "http://" + h;
DigestURI url = null;
DigestURL url = null;
try {
url = new DigestURI(h);
url = new DigestURL(h);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
return null;
@ -111,16 +107,17 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
/**
* DigestURI from File
*/
public DigestURI(final File file) throws MalformedURLException {
public DigestURL(final File file) throws MalformedURLException {
this("file", "", -1, file.getAbsolutePath());
}
/**
* DigestURI from URI string
*/
public DigestURI(final String url) throws MalformedURLException {
public DigestURL(final String url) throws MalformedURLException {
super(url);
this.hash = null;
this.properties = new Properties();
}
/**
@ -129,43 +126,36 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
* @param hash already calculated hash for url
* @throws MalformedURLException
*/
public DigestURI(final String url, final byte[] hash) throws MalformedURLException {
public DigestURL(final String url, final byte[] hash) throws MalformedURLException {
super(url);
this.hash = hash;
this.properties = new Properties();
}
/**
* DigestURI from general URI
* @param u
*/
/*
private DigestURI(final MultiProtocolURI u) {
super(u);
this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null;
}
*/
/**
* DigestURI from general URI, hash already calculated
* @param baseURL
* @param hash
*/
public DigestURI(final MultiProtocolURI baseURL, final byte[] hash) {
public DigestURL(final MultiProtocolURL baseURL, final byte[] hash) {
super(baseURL);
this.hash = hash;
this.properties = new Properties();
}
public DigestURI(final MultiProtocolURI baseURL, final String relPath) throws MalformedURLException {
public DigestURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException {
super(baseURL, relPath);
this.hash = null;
this.properties = new Properties();
}
public DigestURI(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
public DigestURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
super(protocol, host, port, path);
this.hash = null;
this.properties = new Properties();
}
public static DigestURI newURL(final DigestURI baseURL, String relPath) throws MalformedURLException {
public static DigestURL newURL(final DigestURL baseURL, String relPath) throws MalformedURLException {
if (relPath.startsWith("//")) {
// patch for urls starting with "//" which can be found in the wild
relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath;
@ -177,13 +167,17 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
isFile(relPath) ||
isSMB(relPath)/*||
relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) {
return new DigestURI(relPath);
return new DigestURL(relPath);
}
return new DigestURI(baseURL, relPath);
return new DigestURL(baseURL, relPath);
}
private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful
public Properties getProperties() {
return this.properties;
}
@Override
public int hashCode() {
if (this.hashCache == Integer.MIN_VALUE) {
@ -302,25 +296,6 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
public final boolean probablyRootURL() {
return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
}
public RowHandleSet getPossibleRootHashes() {
RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10);
String rootStub = this.getProtocol() + "://" + this.getHost();
try {
rootCandidates.put(new DigestURI(rootStub).hash());
rootCandidates.put(new DigestURI(rootStub + "/").hash());
rootCandidates.put(new DigestURI(rootStub + "/index.htm").hash());
rootCandidates.put(new DigestURI(rootStub + "/index.html").hash());
rootCandidates.put(new DigestURI(rootStub + "/index.php").hash());
rootCandidates.put(new DigestURI(rootStub + "/home.htm").hash());
rootCandidates.put(new DigestURI(rootStub + "/home.html").hash());
rootCandidates.put(new DigestURI(rootStub + "/home.php").hash());
rootCandidates.put(new DigestURI(rootStub + "/default.htm").hash());
rootCandidates.put(new DigestURI(rootStub + "/default.html").hash());
rootCandidates.put(new DigestURI(rootStub + "/default.php").hash());
} catch (final Throwable e) {}
return rootCandidates;
}
private static final String hosthash5(final String protocol, final String host, final int port) {
if (host == null) {

@ -23,7 +23,7 @@
*/
package net.yacy.cora.document;
package net.yacy.cora.document.id;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
@ -47,9 +47,10 @@ import java.util.regex.Pattern;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
import net.yacy.cora.document.Punycode.PunycodeException;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.TimeoutRequest;
@ -61,9 +62,9 @@ import net.yacy.cora.util.CommonPattern;
* MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
*
*/
public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolURI> {
public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolURL> {
public static final MultiProtocolURI POISON = new MultiProtocolURI(); // poison pill for concurrent link generators
public static final MultiProtocolURL POISON = new MultiProtocolURL(); // poison pill for concurrent link generators
private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
private static final long serialVersionUID = -1173233022912141884L;
@ -96,7 +97,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
/**
* initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues
*/
public MultiProtocolURI() {
public MultiProtocolURL() {
this.protocol = null;
this.host = null;
this.hostAddress = null;
@ -108,11 +109,11 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.port = -1;
}
public MultiProtocolURI(final File file) throws MalformedURLException {
public MultiProtocolURL(final File file) throws MalformedURLException {
this("file", "", -1, file.getAbsolutePath());
}
protected MultiProtocolURI(final MultiProtocolURI url) {
protected MultiProtocolURL(final MultiProtocolURL url) {
this.protocol = url.protocol;
this.host = url.host;
this.hostAddress = null;
@ -124,7 +125,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.port = url.port;
}
public MultiProtocolURI(String url) throws MalformedURLException {
public MultiProtocolURL(String url) throws MalformedURLException {
if (url == null) throw new MalformedURLException("url string is null");
this.hostAddress = null;
@ -275,7 +276,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return this.contentDomain;
}
public static MultiProtocolURI newURL(final String baseURL, String relPath) throws MalformedURLException {
public static MultiProtocolURL newURL(final String baseURL, String relPath) throws MalformedURLException {
if (relPath.startsWith("//")) {
// patch for urls starting with "//" which can be found in the wild
relPath = "http:" + relPath;
@ -287,12 +288,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
isFile(relPath) ||
isSMB(relPath)/*||
relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) {
return new MultiProtocolURI(relPath);
return new MultiProtocolURL(relPath);
}
return new MultiProtocolURI(new MultiProtocolURI(baseURL), relPath);
return new MultiProtocolURL(new MultiProtocolURL(baseURL), relPath);
}
public static MultiProtocolURI newURL(final MultiProtocolURI baseURL, String relPath) throws MalformedURLException {
public static MultiProtocolURL newURL(final MultiProtocolURL baseURL, String relPath) throws MalformedURLException {
if (relPath.startsWith("//")) {
// patch for urls starting with "//" which can be found in the wild
relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath;
@ -304,12 +305,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
isFile(relPath) ||
isSMB(relPath)/*||
relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) {
return new MultiProtocolURI(relPath);
return new MultiProtocolURL(relPath);
}
return new MultiProtocolURI(baseURL, relPath);
return new MultiProtocolURL(baseURL, relPath);
}
public MultiProtocolURI(final MultiProtocolURI baseURL, String relPath) throws MalformedURLException {
public MultiProtocolURL(final MultiProtocolURL baseURL, String relPath) throws MalformedURLException {
if (baseURL == null) throw new MalformedURLException("base URL is null");
if (relPath == null) throw new MalformedURLException("relPath is null");
@ -361,7 +362,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
escape();
}
public MultiProtocolURI(final String protocol, String host, final int port, final String path) throws MalformedURLException {
public MultiProtocolURL(final String protocol, String host, final int port, final String path) throws MalformedURLException {
if (protocol == null) throw new MalformedURLException("protocol is null");
if (host.indexOf(':') >= 0 && host.charAt(0) != '[') host = '[' + host + ']'; // IPv6 host must be enclosed in square brackets
this.protocol = protocol;
@ -948,8 +949,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
public boolean equals(final Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (!(obj instanceof MultiProtocolURI)) return false;
final MultiProtocolURI other = (MultiProtocolURI) obj;
if (!(obj instanceof MultiProtocolURL)) return false;
final MultiProtocolURL other = (MultiProtocolURL) obj;
return
((this.protocol == null && other.protocol == null) || (this.protocol != null && other.protocol != null && this.protocol.equals(other.protocol))) &&
@ -961,7 +962,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
@Override
public int compareTo(final MultiProtocolURI h) {
public int compareTo(final MultiProtocolURL h) {
int c;
if (this.protocol != null && h.protocol != null && (c = this.protocol.compareTo(h.protocol)) != 0) return c;
if (this.host != null && h.host != null && (c = this.host.compareTo(h.host)) != 0) return c;
@ -2167,12 +2168,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
};
//MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File("defaults/sessionid.names")));
String environment, url;
MultiProtocolURI aURL, aURL1;
MultiProtocolURL aURL, aURL1;
java.net.URL jURL;
for (String[] element : test) {
environment = element[0];
url = element[1];
try {aURL = MultiProtocolURI.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;}
try {aURL = MultiProtocolURL.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;}
if (environment == null) {
try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;}
} else {
@ -2190,7 +2191,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// check stability: the normalform of the normalform must be equal to the normalform
if (aURL != null) try {
aURL1 = new MultiProtocolURI(aURL.toNormalform(false));
aURL1 = new MultiProtocolURL(aURL.toNormalform(false));
if (!(aURL1.toNormalform(false).equals(aURL.toNormalform(false)))) {
System.out.println("no stability for url:");
System.out.println("aURL0=" + aURL.toString());

@ -21,7 +21,7 @@
* USA
*/
package net.yacy.cora.document;
package net.yacy.cora.document.id;
public class Punycode {

@ -30,11 +30,11 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.SearchAccumulator;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
@ -163,9 +163,9 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final CacheStrategy cacheStrategy,
final boolean global,
final ClientIdentification.Agent agent) throws IOException {
MultiProtocolURI uri = null;
MultiProtocolURL uri = null;
try {
uri = new MultiProtocolURI(rssSearchServiceURL);
uri = new MultiProtocolURL(rssSearchServiceURL);
} catch (final MalformedURLException e) {
throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
}
@ -182,7 +182,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
parts.put("nav", UTF8.StringBody("none"));
// result = HTTPConnector.getConnector(userAgent == null ? MultiProtocolURI.yacybotUserAgent : userAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
final HTTPClient httpClient = new HTTPClient(agent);
result = httpClient.POSTbytes(new MultiProtocolURI(rssSearchServiceURL), uri.getHost(), parts, false);
result = httpClient.POSTbytes(new MultiProtocolURL(rssSearchServiceURL), uri.getHost(), parts, false);
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
if (reader == null) {
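Note on the hunk above: SRURSSConnector only switches its imports to the reorganized net.yacy.cora.document packages and the MultiProtocolURL type; the parse-and-wrap error handling stays the same. A hedged sketch of that pattern in isolation, using only the constructor and the exception wrapping visible above; the sketch class and method names are illustrative:

// sketch only, not part of this commit
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.MultiProtocolURL;

public class ServiceURLParseSketch {
    // turn a syntactically broken service URL into an IOException, as the connector above does
    static MultiProtocolURL parseServiceURL(final String rssSearchServiceURL) throws IOException {
        try {
            return new MultiProtocolURL(rssSearchServiceURL);
        } catch (final MalformedURLException e) {
            throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
        }
    }
}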

@ -34,11 +34,11 @@ import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
@ -78,7 +78,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURI url) {
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
String hostid = url.hosthash();
@ -149,7 +149,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return changed;
}
public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration clickdepthfield) {
public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURL url, SchemaDeclaration clickdepthfield) {
if (!this.contains(clickdepthfield)) return false;
// get new click depth and compare with old
Integer oldclickdepth = (Integer) doc.getFieldValue(clickdepthfield.getSolrFieldName());
@ -165,7 +165,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return false;
}
public boolean postprocessing_references(ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map<String, Long> hostExtentCount) {
public boolean postprocessing_references(ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURL url, Map<String, Long> hostExtentCount) {
if (!(this.contains(CollectionSchema.references_i) ||
this.contains(CollectionSchema.references_internal_i) ||
this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false;
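Note on the hunks above: the SchemaConfiguration postprocessing methods now take a DigestURL instead of a DigestURI, and postprocessing_doublecontent() starts by reading url.hosthash() to detect documents from the same host. A hedged sketch of that host-grouping step in isolation, assuming only DigestURL and its hosthash() accessor from this diff; the sample URLs and the map are illustrative, and the sketch keeps Java 7 syntax to match the code base:

// sketch only, not part of this commit
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.id.DigestURL;

public class HostGroupingSketch {
    public static void main(String[] args) throws MalformedURLException {
        final Map<String, List<DigestURL>> byHost = new HashMap<String, List<DigestURL>>();
        final String[] samples = {"http://example.org/a", "http://example.org/b", "http://example.net/c"};
        for (final String s : samples) {
            final DigestURL url = new DigestURL(s);
            final String hostid = url.hosthash();   // same call as in postprocessing_doublecontent above
            List<DigestURL> group = byHost.get(hostid);
            if (group == null) {
                group = new ArrayList<DigestURL>();
                byHost.put(hostid, group);
            }
            group.add(url);
        }
        System.out.println(byHost.keySet());
    }
}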

@ -40,10 +40,9 @@ import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import org.apache.lucene.document.Document;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.core.SolrCore;
@ -165,7 +164,7 @@ public class SolrServlet implements Filter {
int sz = ids.size();
for (int i = 0; i < sz; i++) {
int id = iterator.nextDoc();
Document doc = searcher.doc(id);
searcher.doc(id);
}
}
}

@ -33,7 +33,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.LookAheadIterator;

@ -30,7 +30,7 @@ import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;

@ -27,7 +27,7 @@ import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.schema.CollectionSchema;

Some files were not shown because too many files have changed in this diff.