From 788288eb9ec59cedaba3866ff73c766d5d059e96 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 22 Feb 2013 15:45:15 +0100 Subject: [PATCH] added the generation of 50 (!!) new solr fields in the core 'webgraph'. The default schema uses only some of them and the resulting search index now has the following properties: - webgraph size will have about 40 times as many entries as the default index - the complete index size will increase and may be about double the size of the current amount As testing showed, not much indexing performance is lost. The default index will be smaller (fields were moved out of it); thus searching can be faster. The new index will make it possible to remove some old parts of YaCy, i.e. the specialized webgraph data and the noload crawler. The new index will make it possible to: - search within link texts of linked but not indexed documents (about 20 times the size of the document index!!) - get a very detailed link graph - enhance ranking using a complete link graph To get full access to the new index, the API to solr now has two access points: one with attribute core=collection1 for the default search index and core=webgraph for the new webgraph search index. This is also available for p2p operation but client access is not yet implemented. 
--- .classpath | 4 +- defaults/solr.collection.schema | 42 ---- defaults/solr.webgraph.schema | 41 ++-- htroot/ConfigHeuristics_p.java | 3 +- htroot/CrawlStartScanner_p.java | 12 +- htroot/Crawler_p.html | 16 +- htroot/Crawler_p.java | 8 +- htroot/HostBrowser.java | 8 +- htroot/IndexControlRWIs_p.java | 2 +- htroot/IndexControlURLs_p.java | 4 +- htroot/IndexSchema_p.html | 3 +- htroot/IndexShare_p.java | 4 +- htroot/Load_RSS_p.java | 5 +- htroot/ServerScannerList.java | 2 +- htroot/ViewFile.java | 8 +- htroot/api/getpageinfo.java | 4 +- htroot/api/getpageinfo_p.java | 4 +- htroot/api/status_p.java | 3 +- htroot/api/status_p.xml | 1 + htroot/api/webstructure.java | 7 +- htroot/env/templates/header.template | 5 +- htroot/gsa/searchresult.java | 2 +- htroot/js/Crawler.js | 2 + htroot/solr/select.java | 5 +- htroot/yacy/query.java | 2 +- htroot/yacy/transferRWI.java | 2 +- htroot/yacy/transferURL.java | 2 +- .../opensearch/OpenSearchConnector.java | 2 +- .../solr/connector/AbstractSolrConnector.java | 9 + .../solr/connector/EmbeddedSolrConnector.java | 8 +- .../solr/connector/SolrConnector.java | 13 +- .../solr/connector/SolrServerConnector.java | 30 +++ .../solr/instance/InstanceMirror.java | 43 +++- .../yacy/cora/protocol/HeaderFramework.java | 6 +- .../net/yacy/cora/protocol/RequestHeader.java | 5 +- source/net/yacy/cora/protocol/Scanner.java | 5 +- source/net/yacy/crawler/CrawlStacker.java | 8 +- .../net/yacy/crawler/data/ResultImages.java | 2 +- .../yacy/crawler/retrieval/HTTPLoader.java | 7 +- .../net/yacy/crawler/retrieval/RSSLoader.java | 3 +- .../crawler/retrieval/SitemapImporter.java | 2 +- source/net/yacy/data/BookmarkHelper.java | 9 +- source/net/yacy/document/Condenser.java | 5 +- source/net/yacy/document/Document.java | 115 ++++++----- .../document/parser/html/ContentScraper.java | 101 +++++----- .../yacy/document/parser/html/EmbedEntry.java | 8 +- .../yacy/document/parser/html/ImageEntry.java | 8 +- .../parser/html/ScraperInputStream.java | 4 +- 
.../net/yacy/document/parser/htmlParser.java | 3 +- .../parser/images/genericImageParser.java | 12 +- .../net/yacy/document/parser/rssParser.java | 7 +- .../yacy/document/parser/sevenzipParser.java | 3 +- .../yacy/document/parser/sitemapParser.java | 3 +- .../net/yacy/document/parser/swfParser.java | 5 +- .../net/yacy/document/parser/tarParser.java | 3 +- .../net/yacy/document/parser/vcfParser.java | 5 +- .../net/yacy/document/parser/zipParser.java | 3 +- .../yacy/kelondro/data/meta/DigestURI.java | 24 ++- source/net/yacy/peers/Transmission.java | 2 +- .../peers/graphics/WebStructureGraph.java | 21 +- .../net/yacy/peers/operation/yacyRelease.java | 4 +- .../net/yacy/repository/LoaderDispatcher.java | 3 +- source/net/yacy/search/Switchboard.java | 46 ++--- source/net/yacy/search/index/Fulltext.java | 123 ++++++++---- source/net/yacy/search/index/Segment.java | 45 +++-- source/net/yacy/search/query/QueryParams.java | 11 +- .../schema/CollectionConfiguration.java | 165 +++++----------- .../yacy/search/schema/CollectionSchema.java | 10 +- .../search/schema/WebgraphConfiguration.java | 184 ++++++++++++++++++ .../yacy/search/schema/WebgraphSchema.java | 31 +-- .../net/yacy/search/snippet/MediaSnippet.java | 11 +- .../yacy/server/http/HTTPDProxyHandler.java | 6 +- 72 files changed, 783 insertions(+), 541 deletions(-) diff --git a/.classpath b/.classpath index 0b3ea7ce7..f832ab578 100644 --- a/.classpath +++ b/.classpath @@ -1,7 +1,7 @@ - + @@ -10,7 +10,7 @@ - + diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index bc045020c..ac1f9f532 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -214,27 +214,6 @@ inboundlinks_protocol_sxt ## internal links, the url only without the protocol inboundlinks_urlstub_txt -## internal links, the name property of the a-tag -#inboundlinks_name_txt - -## internal links, the rel property of the a-tag -#inboundlinks_rel_sxt - -## internal links, the rel property of the a-tag, 
coded binary -#inboundlinks_relflags_val - -## internal links, the text content of the a-tag -#inboundlinks_text_txt - -## internal links, the length of the a-tag as number of characters -#inboundlinks_text_chars_val - -## internal links, the length of the a-tag as number of words -#inboundlinks_text_words_val - -##if the link is an image link, this contains the alt tag if the image is also liked as img link -#inboundlinks_alttag_txt - ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow #outboundlinks_tag_txt @@ -244,27 +223,6 @@ outboundlinks_protocol_sxt ## external links, the url only without the protocol outboundlinks_urlstub_txt -## external links, the name property of the a-tag -#outboundlinks_name_txt - -## external links, the rel property of the a-tag -#outboundlinks_rel_sxt - -## external links, the rel property of the a-tag, coded binary -#outboundlinks_relflags_val - -## external links, the text content of the a-tag -#outboundlinks_text_txt - -## external links, the length of the a-tag as number of characters -#outboundlinks_text_chars_val - -## external links, the length of the a-tag as number of words -#outboundlinks_text_words_val - -##if the link is an image link, this contains the alt tag if the image is also liked as img link -#outboundlinks_alttag_txt - ## all image tags, encoded as tag inclusive alt- and title property #images_tag_txt diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index f48931681..7e69a7c8a 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -15,6 +15,12 @@ ## primary key of document, a combination of (28 characters) id +## last-modified from http header, date (mandatory field) +last_modified + +## time when resource was loaded +load_date_dt + ## tags that are attached to crawls/index generation to separate the search result into user-defined subsets collection_sxt @@ -26,21 +32,18 @@ collection_sxt ## primary key of document, the URL hash 
(source) source_id_s -## the url of the document (source) -#source_url_s +## the protocol of the url (source) +#source_protocol_s + +## the url without the protocol (source) +#source_urlstub_s ## the file name extension (source) #source_file_ext_s -## normalized (absolute URLs), as - tag with anchor text and nofollow (source) -#source_tag_s - ## number of all characters in the url (source) #source_chars_i -## the protocol of the url (source) -#source_protocol_s - ## path of the url (source) #source_path_s @@ -62,9 +65,12 @@ source_id_s ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source) #source_clickdepth_i -## host of the url +## host of the url (source) #source_host_s +## id of the host (source) +source_host_id_s + ## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source) #source_host_dnc_s @@ -117,8 +123,11 @@ target_name_t ## primary key of document, the URL hash (target) target_id_s -## the url of the document (target) -target_url_s +## the protocol of the url (target) +target_protocol_s + +## the url without the protocol (target) +target_urlstub_s ## the file name extension (target) target_file_ext_s @@ -129,9 +138,6 @@ target_file_ext_s ## number of all characters in the url (target) #target_chars_i -## the protocol of the url (target) -target_protocol_s - ## path of the url (target) #target_path_s @@ -156,6 +162,9 @@ target_path_folders_sxt ## host of the url (target) #target_host_s +## id of the host (target) +target_host_id_s + ## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target) #target_host_dnc_s @@ -168,5 +177,5 @@ target_path_folders_sxt ## the remaining part of the host without organizationdnc (target) #target_host_subdomain_s - - +## flag shows if the target host is equal to the source host +target_inbound_b diff --git a/htroot/ConfigHeuristics_p.java 
b/htroot/ConfigHeuristics_p.java index 32af59772..c0d665a3e 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -121,7 +121,8 @@ public class ConfigHeuristics_p { } try { sb.index.fulltext().getDefaultConfiguration().commit(); - } catch (IOException ex) {} + } catch (IOException e) { + } } } diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index 1482991b0..b08cdb19f 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -143,13 +143,12 @@ public class CrawlStartScanner_p if ( post.containsKey("crawl") ) { // make a pk/url mapping final Iterator> se = Scanner.scancacheEntries(); - final Map pkmap = - new TreeMap(Base64Order.enhancedCoder); + final Map pkmap = new TreeMap(Base64Order.enhancedCoder); while (se.hasNext()) { final Scanner.Service u = se.next().getKey(); DigestURI uu; try { - uu = DigestURI.toDigestURI(u.url()); + uu = u.url(); pkmap.put(uu.hash(), uu); } catch ( final MalformedURLException e ) { Log.logException(e); @@ -197,15 +196,14 @@ public class CrawlStartScanner_p String urlString; DigestURI u; try { - final Iterator> se = - Scanner.scancacheEntries(); + final Iterator> se = Scanner.scancacheEntries(); Map.Entry host; while ( se.hasNext() ) { host = se.next(); try { - u = DigestURI.toDigestURI(host.getKey().url()); + u = host.getKey().url(); urlString = u.toNormalform(true); - if ( host.getValue() == Access.granted + if (host.getValue() == Access.granted && Scanner.inIndex(apiCommentCache, urlString) == null ) { String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; diff --git a/htroot/Crawler_p.html 
b/htroot/Crawler_p.html index 229eaddd1..80769580c 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -21,7 +21,7 @@ #%env/templates/submenuCrawlMonitor.template%#

Crawler

-
+
Queues @@ -74,20 +74,24 @@
-
+
Index Size - - + + - + - + + + + + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index fc76cf17b..003958f8b 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -36,7 +36,6 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.SpaceExceededException; @@ -73,6 +72,7 @@ public class Crawler_p { final serverObjects prop = new serverObjects(); prop.put("rejected", 0); prop.put("urlpublictextSize", 0); + prop.put("webgraphSize", 0); prop.put("rwipublictextSize", 0); prop.put("list", "0"); prop.put("loaderSize", 0); @@ -277,8 +277,8 @@ public class Crawler_p { try { scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); // get links and generate filter - for (MultiProtocolURI u: scraper.getAnchors().keySet()) { - newRootURLs.add(DigestURI.toDigestURI(u)); + for (DigestURI u: scraper.getAnchors().keySet()) { + newRootURLs.add(u); } } catch (IOException e) { Log.logException(e); @@ -475,7 +475,7 @@ public class Crawler_p { writer.close(); // get links and generate filter - final Map hyperlinks = scraper.getAnchors(); + final Map hyperlinks = scraper.getAnchors(); if (newcrawlingdepth > 0) { if (fullDomain) { newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks.keySet()); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 3f9912fbf..1097255e0 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -79,7 +79,7 @@ public class HostBrowser { // set default values prop.put("path", ""); prop.put("result", ""); - prop.putNum("ucount", fulltext.size()); + prop.putNum("ucount", fulltext.collectionSize()); prop.put("hosts", 0); prop.put("files", 0); prop.put("admin", 0); @@ -117,7 +117,7 @@ public class HostBrowser { 
String load = post.get("load", ""); boolean wait = false; - if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(pathURI.hash())) { + if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) { // in case that the url does not exist and loading is wanted turn this request into a loading request load = path; wait = true; @@ -136,7 +136,7 @@ public class HostBrowser { )); prop.put("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); if (wait) for (int i = 0; i < 30; i++) { - if (sb.index.exists(url.hash())) break; + if (sb.index.exists(ASCII.String(url.hash()))) break; try {Thread.sleep(100);} catch (InterruptedException e) {} } } catch (MalformedURLException e) { @@ -480,7 +480,7 @@ public class HostBrowser { } // insert constants - prop.putNum("ucount", fulltext.size()); + prop.putNum("ucount", fulltext.collectionSize()); // return rewrite properties return prop; } diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index d81fae130..8b16efa76 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -280,7 +280,7 @@ public class IndexControlRWIs_p { Reference iEntry; while (urlIter.hasNext()) { iEntry = urlIter.next(); - if (!segment.fulltext().exists(iEntry.urlhash())) { + if (!segment.fulltext().exists(ASCII.String(iEntry.urlhash()))) { try { unknownURLEntries.put(iEntry.urlhash()); } catch (final SpaceExceededException e) { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 2fd724c99..6e2b1ec87 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -66,7 +66,7 @@ public class IndexControlURLs_p { prop.put("urlstring", ""); prop.put("urlhash", ""); prop.put("result", ""); - prop.putNum("ucount", segment.fulltext().size()); + prop.putNum("ucount", 
segment.fulltext().collectionSize()); prop.put("otherHosts", ""); prop.put("genUrlProfile", 0); prop.put("statistics", 1); @@ -312,7 +312,7 @@ public class IndexControlURLs_p { } // insert constants - prop.putNum("ucount", segment.fulltext().size()); + prop.putNum("ucount", segment.fulltext().collectionSize()); // return rewrite properties return prop; } diff --git a/htroot/IndexSchema_p.html b/htroot/IndexSchema_p.html index 81283a04c..64d701fcc 100644 --- a/htroot/IndexSchema_p.html +++ b/htroot/IndexSchema_p.html @@ -16,12 +16,13 @@

If you use a custom Solr schema you may enter a different field name in the column 'Custom Solr Field Name' of the YaCy default attribute name

- Select a Core: + Select a core: +    ... the core can be searched at /solr/select?core=#[core]#&q=*:*&start=0&rows=3
diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java index 03e49f9ca..028023d06 100644 --- a/htroot/IndexShare_p.java +++ b/htroot/IndexShare_p.java @@ -51,7 +51,7 @@ public class IndexShare_p { prop.put("dtable", ""); prop.put("rtable", ""); prop.putNum("wcount", indexSegment.RWICount()); - prop.putNum("ucount", indexSegment.fulltext().size()); + prop.putNum("ucount", indexSegment.fulltext().collectionSize()); return prop; // be save } @@ -64,7 +64,7 @@ public class IndexShare_p { // insert constants prop.putNum("wcount", indexSegment.RWICount()); - prop.putNum("ucount", indexSegment.fulltext().size()); + prop.putNum("ucount", indexSegment.fulltext().collectionSize()); // return rewrite properties return prop; diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 4937965ac..bddc8a072 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Hit; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; @@ -272,7 +273,7 @@ public class Load_RSS_p { final RSSMessage message = feed.getMessage(entry.getValue().substring(5)); final DigestURI messageurl = new DigestURI(message.getLink()); if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop; - if (sb.urlExists(messageurl.hash()) != null) continue loop; + if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop; sb.addToIndex(messageurl, null, null); RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); } catch (final IOException e) { @@ -317,7 +318,7 @@ public class Load_RSS_p { author = item.getAuthor(); if (author == null) author = item.getCopyright(); pubDate = item.getPubDate(); - prop.put("showitems_item_" + i + "_state", sb.urlExists(messageurl.hash()) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 
1 : 0); + prop.put("showitems_item_" + i + "_state", sb.urlExists(ASCII.String(messageurl.hash())) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); prop.put("showitems_item_" + i + "_state_count", i); prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid()); prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); diff --git a/htroot/ServerScannerList.java b/htroot/ServerScannerList.java index 4bd7b531c..4809fe914 100644 --- a/htroot/ServerScannerList.java +++ b/htroot/ServerScannerList.java @@ -63,7 +63,7 @@ public class ServerScannerList { while (se.hasNext()) { host = se.next(); try { - u = DigestURI.toDigestURI(host.getKey().url()); + u = host.getKey().url(); urlString = u.toNormalform(true); prop.put("servertable_list_" + i + "_edit", edit ? 1 : 0); prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash())); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 244a9fddb..da65a4a9d 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -308,7 +308,7 @@ public class ViewFile { i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors()); dark = (i % 2 == 0); - final Map ts = document.getImages(); + final Map ts = document.getImages(); final Iterator tsi = ts.values().iterator(); ImageEntry entry; while (tsi.hasNext()) { @@ -432,12 +432,12 @@ public class ViewFile { final serverObjects prop, final String[] wordArray, int c, - final Map media, + final Map media, final String type, boolean dark, - final Map alllinks) { + final Map alllinks) { int i = 0; - for (final Map.Entry entry : media.entrySet()) { + for (final Map.Entry entry : media.entrySet()) { final Properties p = alllinks.get(entry.getKey()); final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index 
b5c073c3c..566e26b5e 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -127,11 +127,11 @@ public class getpageinfo { prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next()); // get links and put them into a semicolon-separated list - final Set uris = scraper.getAnchors().keySet(); + final Set uris = scraper.getAnchors().keySet(); final StringBuilder links = new StringBuilder(uris.size() * 80); final StringBuilder filter = new StringBuilder(uris.size() * 40); count = 0; - for (final MultiProtocolURI uri: uris) { + for (final DigestURI uri: uris) { if (uri == null) continue; links.append(';').append(uri.toNormalform(true)); filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index d3205a4d9..7946a2b70 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -127,11 +127,11 @@ public class getpageinfo_p { prop.putXML("lang", (languages == null || languages.size() == 0) ? 
"unknown" : languages.iterator().next()); // get links and put them into a semicolon-separated list - final Set uris = scraper.getAnchors().keySet(); + final Set uris = scraper.getAnchors().keySet(); final StringBuilder links = new StringBuilder(uris.size() * 80); final StringBuilder filter = new StringBuilder(uris.size() * 40); count = 0; - for (final MultiProtocolURI uri: uris) { + for (final DigestURI uri: uris) { if (uri == null) continue; links.append(';').append(uri.toNormalform(true)); filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index cf9529404..07eb120f4 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -76,7 +76,8 @@ public class status_p { prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER)); // index size - prop.putNum("urlpublictextSize", segment.fulltext().size()); + prop.putNum("urlpublictextSize", segment.fulltext().collectionSize()); + prop.putNum("webgraphSize", segment.fulltext().webgraphSize()); prop.putNum("rwipublictextSize", segment.RWICount()); // loader queue diff --git a/htroot/api/status_p.xml b/htroot/api/status_p.xml index 25c62234b..7eec1d761 100644 --- a/htroot/api/status_p.xml +++ b/htroot/api/status_p.xml @@ -21,6 +21,7 @@ #[urlpublictextSize]# + #[webgraphSize]# #[rwipublictextSize]# diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index 53f7cbc95..5d834b87e 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -30,7 +30,6 @@ import java.util.Map; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.RequestHeader; @@ -111,9 +110,9 @@ public class webstructure { prop.put("references_documents_0_urle", url == null ? 
0 : 1); if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true)); int d = 0; - Iterator i = scraper.inboundLinks().iterator(); + Iterator i = scraper.inboundLinks().iterator(); while (i.hasNext()) { - DigestURI refurl = DigestURI.toDigestURI(i.next()); + DigestURI refurl = i.next(); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); @@ -122,7 +121,7 @@ public class webstructure { } i = scraper.outboundLinks().iterator(); while (i.hasNext()) { - DigestURI refurl = DigestURI.toDigestURI(i.next()); + DigestURI refurl = i.next(); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index ae5020f34..3b6c4b6c4 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -11,8 +11,9 @@
  • File Search
  • Host Browser
  • -
  • Embedded Solr API
  • -
  • Embedded GSA API
  • +
  • Solr Default Core
  • +
  • Solr Webgraph Core
  • +
  • Google Search API
  • Compare Search
  • URL Viewer
  • diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 78cb84ac5..1410a647e 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -169,7 +169,7 @@ public class searchresult { } // get the embedded connector - EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector(); + EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultEmbeddedConnector(); if (connector == null) return null; // do the solr request diff --git a/htroot/js/Crawler.js b/htroot/js/Crawler.js index 208ba6b1e..a2890c26e 100644 --- a/htroot/js/Crawler.js +++ b/htroot/js/Crawler.js @@ -90,7 +90,9 @@ function handleStatus(){ dbsize=getFirstChild(statusTag, "dbsize"); urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext")); rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext")); + webgraphSize=getValue(getFirstChild(dbsize, "webgraph")); document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize; + document.getElementById("webgraphsize").firstChild.nodeValue=webgraphSize; document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize; loaderqueue=getFirstChild(statusTag, "loaderqueue"); diff --git a/htroot/solr/select.java b/htroot/solr/select.java index 1ad96f2da..96ba19905 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -42,6 +42,8 @@ import net.yacy.search.SwitchboardConstants; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.QueryModifier; import net.yacy.search.query.SearchEvent; +import net.yacy.search.schema.CollectionSchema; +import net.yacy.search.schema.WebgraphSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -181,7 +183,8 @@ public class select { } // get the embedded connector - EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector(); + boolean defaultConnector = post == null || post.get("core", 
CollectionSchema.CORE_NAME).equals(CollectionSchema.CORE_NAME); + EmbeddedSolrConnector connector = defaultConnector ? sb.index.fulltext().getDefaultEmbeddedConnector() : sb.index.fulltext().getEmbeddedConnector(WebgraphSchema.CORE_NAME); if (connector == null) return null; // do the solr request, generate facets if we use a special YaCy format diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index 88cd9284e..594d903b1 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -109,7 +109,7 @@ public final class query { if (obj.equals("lurlcount")) { // return the number of all available l-url's - prop.put("response", sb.index.fulltext().size()); + prop.put("response", sb.index.fulltext().collectionSize()); return prop; } diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index e54f15ee4..14186eb7a 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -201,7 +201,7 @@ public final class transferRWI { // check if we need to ask for the corresponding URL if (!knownURL.has(urlHash) && !unknownURL.has(urlHash)) try { - if (sb.index.fulltext().exists(urlHash)) { + if (sb.index.fulltext().exists(ASCII.String(urlHash))) { knownURL.put(urlHash); } else { unknownURL.put(urlHash); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index acfa99138..bcaf3da08 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -139,7 +139,7 @@ public final class transferURL { } // doublecheck - if (sb.index.exists(lEntry.hash())) { + if (sb.index.exists(ASCII.String(lEntry.hash()))) { if (Network.log.isFine()) Network.log.logFine("transferURL: double URL '" + lEntry.url() + "' from peer " + otherPeerName); lEntry = null; doublecheck++; diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index c095bc0d8..b925bf0c0 100644 --- 
a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -175,7 +175,7 @@ public class OpenSearchConnector { if (sb == null) { return false; } - final EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector(); + final EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultEmbeddedConnector(); // check if needed Solr fields are available (selected) if (connector == null) { Log.logSevere("OpenSearchConnector.Discover", "Error on connecting to embedded Solr index"); diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 72a87c21a..86b113ec4 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -21,6 +21,7 @@ package net.yacy.cora.federate.solr.connector; import java.io.IOException; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -44,6 +45,7 @@ import org.apache.solr.client.solrj.response.FacetField.Count; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.params.ModifiableSolrParams; @@ -285,4 +287,11 @@ public abstract class AbstractSolrConnector implements SolrConnector { throw new IOException(e.getMessage(), e); } } + + @Override + public void add(final Collection solrdocs) throws IOException, SolrException { + for (SolrInputDocument solrdoc: solrdocs) { + add(solrdoc); + } + } } diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 
ffa6f19ae..98c68331a 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -58,6 +58,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo private final SearchHandler requestHandler; private final EmbeddedInstance instance; + private final String coreName; private SolrCore core; public EmbeddedSolrConnector(EmbeddedInstance instance) { @@ -68,6 +69,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo this.requestHandler.init(new NamedList()); this.requestHandler.inform(this.core); super.init(this.instance.getDefaultServer()); + this.coreName = ((EmbeddedSolrServer) this.server).getCoreContainer().getDefaultCoreName(); } public EmbeddedSolrConnector(EmbeddedInstance instance, String coreName) { @@ -78,6 +80,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo this.requestHandler.init(new NamedList()); this.requestHandler.inform(this.core); super.init(this.instance.getServer(coreName)); + this.coreName = coreName; } public SolrInstance getInstance() { @@ -104,9 +107,8 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo Thread.currentThread().setName("solr query: size"); EmbeddedSolrServer ess = (EmbeddedSolrServer) this.server; CoreContainer coreContainer = ess.getCoreContainer(); - String coreName = coreContainer.getDefaultCoreName(); - SolrCore core = coreContainer.getCore(coreName); - if (core == null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No such core: " + coreName); + SolrCore core = coreContainer.getCore(this.coreName); + if (core == null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No such core: " + this.coreName); try { SolrParams params = AbstractSolrConnector.catchSuccessQuery; diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java 
b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 38a26431c..81a01afe4 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -21,6 +21,7 @@ package net.yacy.cora.federate.solr.connector; import java.io.IOException; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.concurrent.BlockingQueue; @@ -74,8 +75,7 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * delete entries from solr according the given solr query string - * @param id the url hash of the entry - * @return the number of deletions + * @param querystring * @throws IOException */ public void deleteByQuery(final String querystring) throws IOException; @@ -96,6 +96,15 @@ public interface SolrConnector extends Iterable /* Iterable of document * @throws SolrException */ public void add(final SolrInputDocument solrdoc) throws IOException, SolrException; + + /** + * add a collection of solr input documents + * @param solrdocs + * @throws IOException + * @throws SolrException + */ + public void add(final Collection solrdoc) throws IOException, SolrException; + /** * get a field value from solr by given key for the id-field and a field name * @param key diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index 537ca8b1b..dd6210540 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -22,6 +22,8 @@ package net.yacy.cora.federate.solr.connector; import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; import java.util.List; import net.yacy.kelondro.logging.Log; @@ -197,4 +199,32 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen } } + 
@Override + public void add(final Collection solrdocs) throws IOException, SolrException { + if (this.server == null) return; + try { + for (SolrInputDocument solrdoc : solrdocs) { + if (solrdoc.containsKey("_version_")) solrdoc.setField("_version_",0L); // prevent Solr "version conflict" + } + synchronized (this.server) { + this.server.add(solrdocs, -1); + } + } catch (Throwable e) { + // catches "version conflict for": try this again and delete the document in advance + List ids = new ArrayList(); + for (SolrInputDocument solrdoc : solrdocs) ids.add((String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + try { + this.server.deleteById(ids); + } catch (SolrServerException e1) {} + try { + synchronized (this.server) { + this.server.add(solrdocs, -1); + } + } catch (Throwable ee) { + log.warn(e.getMessage() + " IDs=" + ids.toString()); + throw new IOException(ee); + } + } + } + } diff --git a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java index 46d804e0e..b7c76a076 100644 --- a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java +++ b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java @@ -21,8 +21,8 @@ package net.yacy.cora.federate.solr.instance; import java.util.Collection; -import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.federate.solr.connector.CachedSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; @@ -36,12 +36,16 @@ public class InstanceMirror { private ShardInstance solr1; private CachedSolrConnector defaultConnector; private Map connectorCache; + private EmbeddedSolrConnector defaultEmbeddedConnector; + private Map embeddedCache; public InstanceMirror() { this.solr0 = null; this.solr1 = null; this.defaultConnector = null; - this.connectorCache = new HashMap(); + this.connectorCache = new ConcurrentHashMap(); + 
this.defaultEmbeddedConnector = null; + this.embeddedCache = new ConcurrentHashMap(); } public boolean isConnected0() { @@ -50,8 +54,10 @@ public class InstanceMirror { public void connect0(EmbeddedInstance c) { for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr0 = c; } @@ -62,8 +68,10 @@ public class InstanceMirror { public void disconnect0() { if (this.solr0 == null) return; for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr0.close(); this.solr0 = null; } @@ -74,8 +82,10 @@ public class InstanceMirror { public void connect1(ShardInstance c) { for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr1 = c; } @@ -86,8 +96,10 @@ public class InstanceMirror { public void disconnect1() { if (this.solr1 == null) return; for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr1.close(); this.solr1 = null; } @@ -108,8 +120,23 @@ public class InstanceMirror { if (this.solr1 != null) return this.solr1.getCoreNames(); return null; } + + public EmbeddedSolrConnector getDefaultEmbeddedConnector() { + if (this.defaultEmbeddedConnector != null) return this.defaultEmbeddedConnector; + this.defaultEmbeddedConnector = this.solr0 == null ? 
null : new EmbeddedSolrConnector(this.solr0); + this.embeddedCache.put(this.getDefaultCoreName(), this.defaultEmbeddedConnector); + return this.defaultEmbeddedConnector; + } + + public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + EmbeddedSolrConnector ec = this.embeddedCache.get(corename); + if (ec != null) return ec; + ec = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0, corename); + this.embeddedCache.put(corename, ec); + return ec; + } - public SolrConnector getDefaultConnector() { + public SolrConnector getDefaultMirrorConnector() { if (this.defaultConnector != null) return this.defaultConnector; String defaultCoreName = this.getDefaultCoreName(); if (defaultCoreName == null) return null; @@ -120,7 +147,7 @@ public class InstanceMirror { return this.defaultConnector; } - public SolrConnector getConnector(String corename) { + public SolrConnector getMirrorConnector(String corename) { CachedSolrConnector msc = this.connectorCache.get(corename); if (msc != null) return msc; EmbeddedSolrConnector esc = this.solr0 == null ? 
null : new EmbeddedSolrConnector(this.solr0, corename); diff --git a/source/net/yacy/cora/protocol/HeaderFramework.java b/source/net/yacy/cora/protocol/HeaderFramework.java index cbbf3a95d..e3453fcb2 100644 --- a/source/net/yacy/cora/protocol/HeaderFramework.java +++ b/source/net/yacy/cora/protocol/HeaderFramework.java @@ -40,10 +40,10 @@ import java.util.Vector; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.NumberTools; +import net.yacy.kelondro.data.meta.DigestURI; /** @@ -560,7 +560,7 @@ public class HeaderFramework extends TreeMap implements Map conProp) throws MalformedURLException { + public static DigestURI getRequestURL(final HashMap conProp) throws MalformedURLException { String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given @@ -574,7 +574,7 @@ public class HeaderFramework extends TreeMap implements Map hyperlinks) { + public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map hyperlinks) { new Thread() { @Override public void run() { @@ -201,12 +201,12 @@ public final class CrawlStacker { }.start(); } - private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map hyperlinks, final boolean replace) { - for (final Map.Entry e: hyperlinks.entrySet()) { + private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map hyperlinks, final boolean replace) { + for (final Map.Entry e: hyperlinks.entrySet()) { if (e.getKey() == null) continue; // delete old entry, if exists to force a re-load of the url (thats wanted here) - final 
DigestURI url = DigestURI.toDigestURI(e.getKey()); + final DigestURI url = e.getKey(); final byte[] urlhash = url.hash(); if (replace) { this.indexSegment.fulltext().remove(urlhash); diff --git a/source/net/yacy/crawler/data/ResultImages.java b/source/net/yacy/crawler/data/ResultImages.java index e2ba6acf4..62d76ce5d 100644 --- a/source/net/yacy/crawler/data/ResultImages.java +++ b/source/net/yacy/crawler/data/ResultImages.java @@ -61,7 +61,7 @@ public class ResultImages { if (MemoryControl.shortStatus()) clearQueues(); limitQueues(1000); - final Map images = document.getImages(); + final Map images = document.getImages(); for (final ImageEntry image: images.values()) { // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup if (image == null || image.url() == null) continue; diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 9c9d73ec1..b50a78dc7 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -28,7 +28,6 @@ import java.io.IOException; import java.util.Date; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -152,7 +151,7 @@ public final class HTTPLoader { } // normalize URL - final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); // restart crawling with new url this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); @@ -172,7 +171,7 @@ public final class HTTPLoader { } // check if the url was already indexed - final HarvestProcess dbname = 
this.sb.urlExists(redirectionUrl.hash()); + final HarvestProcess dbname = this.sb.urlExists(ASCII.String(redirectionUrl.hash())); if (dbname != null) { // customer request this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode); throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname.toString()); @@ -293,7 +292,7 @@ public final class HTTPLoader { } // normalizing URL - final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); // if we are already doing a shutdown we don't need to retry crawling diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index 3886fffae..265ccced9 100644 --- a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSReader; @@ -90,7 +91,7 @@ public class RSSLoader extends Thread { try { final DigestURI messageurl = new DigestURI(message.getLink()); if (indexTriggered.containsKey(messageurl.hash())) continue loop; - if (sb.urlExists(messageurl.hash()) != null) continue loop; + if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop; sb.addToIndex(messageurl, null, null); indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); loadCount++; diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index ddbe84045..9f31560e0 100644 --- 
a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -82,7 +82,7 @@ public class SitemapImporter extends Thread { // check if the url is known and needs to be recrawled Date lastMod = entry.lastmod(null); if (lastMod != null) { - final HarvestProcess dbocc = this.sb.urlExists(nexturlhash); + final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash)); if (dbocc != null && dbocc == HarvestProcess.LOADED) { // the url was already loaded. we need to check the date final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index ac22158de..ade6df6db 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -48,7 +48,6 @@ import javax.xml.parsers.ParserConfigurationException; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.BookmarksDB.Tag; @@ -134,9 +133,9 @@ public class BookmarkHelper { int importCount = 0; - Map links = new HashMap(); + Map links = new HashMap(); String title; - MultiProtocolURI url; + DigestURI url; Bookmark bm; final Set tags=ListManager.string2set(tag); //this allow multiple default tags try { @@ -148,14 +147,14 @@ public class BookmarkHelper { writer.close(); links = scraper.getAnchors(); } catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());} - for (final Entry link: links.entrySet()) { + for (final Entry link: links.entrySet()) { url = link.getKey(); title = link.getValue().getProperty("name", ""); Log.logInfo("BOOKMARKS", "links.get(url)"); if ("".equals(title)) {//cannot be displayed title = url.toString(); } - bm = db.new 
Bookmark(DigestURI.toDigestURI(url)); + bm = db.new Bookmark(url); bm.setProperty(Bookmark.BOOKMARK_TITLE, title); bm.setTags(tags); bm.setPublic(importPublic); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 39e184024..ddc1279ce 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -50,6 +50,7 @@ import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.logging.Log; @@ -113,7 +114,7 @@ public final class Condenser { // add the URL components to the word list insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); - Map.Entry entry; + Map.Entry entry; if (indexText) { createCondensement(document.getTextString(), meaningLib, doAutotagging); // the phrase counter: @@ -163,7 +164,7 @@ public final class Condenser { if (indexMedia) { // add anchor descriptions: here, we also add the url components // audio - Iterator> i = document.getAudiolinks().entrySet().iterator(); + Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index ca7a7df43..70fea62b6 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -68,7 +68,7 @@ import net.yacy.kelondro.util.FileUtils; public class Document { - private final DigestURI source; // the source url + private final DigestURI 
source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field @@ -78,13 +78,14 @@ public class Document { private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - private final Map anchors; // all links embedded as clickeable entities (anchor tags) - private final Map rss; // all embedded rss feeds - private final Map images; // all visible pictures in document + private final Map anchors; // all links embedded as clickeable entities (anchor tags) + private final Map rss; // all embedded rss feeds + private final Map images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - private Map hyperlinks, audiolinks, videolinks, applinks, inboundlinks, outboundlinks; + private Map audiolinks, videolinks, applinks, hyperlinks; + private Map inboundlinks, outboundlinks; private Map emaillinks; private MultiProtocolURI favicon; private boolean resorted; @@ -103,9 +104,9 @@ public class Document { final String[] sections, final String abstrct, final double lon, final double lat, final Object text, - final Map anchors, - final Map rss, - final Map images, + final Map anchors, + final Map rss, + final Map images, final boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; @@ -120,9 +121,9 @@ public class Document { this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); this.lon = lon; this.lat = lat; - this.anchors = (anchors == null) ? 
new HashMap(0) : anchors; - this.rss = (rss == null) ? new HashMap(0) : rss; - this.images = (images == null) ? new HashMap() : images; + this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.rss = (rss == null) ? new HashMap(0) : rss; + this.images = (images == null) ? new HashMap() : images; this.publisher = publisher; this.hyperlinks = null; this.audiolinks = null; @@ -397,13 +398,13 @@ dc_rights return this.keywords; } - public Map getAnchors() { + public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.anchors; } - public Map getRSS() { + public Map getRSS() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.rss; @@ -412,30 +413,30 @@ dc_rights // the next three methods provide a calculated view on the getAnchors/getImages: - public Map getHyperlinks() { + public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!this.resorted) resortLinks(); return this.hyperlinks; } - public Map getAudiolinks() { + public Map getAudiolinks() { if (!this.resorted) resortLinks(); return this.audiolinks; } - public Map getVideolinks() { + public Map getVideolinks() { if (!this.resorted) resortLinks(); return this.videolinks; } - public Map getImages() { + public Map getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!this.resorted) resortLinks(); return this.images; } - public Map getApplinks() { + public Map getApplinks() { if (!this.resorted) resortLinks(); return this.applinks; } @@ -459,23 +460,23 @@ dc_rights synchronized (this) { if (this.resorted) return; // extract hyperlinks, medialinks and emaillinks from anchorlinks - MultiProtocolURI url; + DigestURI url; String u; int extpos, qpos; String ext = null; final String thishost = this.source.getHost(); - this.inboundlinks = new 
HashMap(); - this.outboundlinks = new HashMap(); - this.hyperlinks = new HashMap(); - this.videolinks = new HashMap(); - this.audiolinks = new HashMap(); - this.applinks = new HashMap(); + this.inboundlinks = new HashMap(); + this.outboundlinks = new HashMap(); + this.hyperlinks = new HashMap(); + this.videolinks = new HashMap(); + this.audiolinks = new HashMap(); + this.applinks = new HashMap(); this.emaillinks = new HashMap(); - final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - for (final Map.Entry entry: collectedImages.entrySet()) { + final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + for (final Map.Entry entry: collectedImages.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } - for (final Map.Entry entry: this.anchors.entrySet()) { + for (final Map.Entry entry: this.anchors.entrySet()) { url = entry.getKey(); if (url == null) continue; final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0; @@ -585,23 +586,23 @@ dc_rights return v; } - private static Map allReflinks(final Collection links) { + private static Map allReflinks(final Collection links) { // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url - final Map v = new HashMap(); + final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; - MultiProtocolURI url = null; + DigestURI url = null; String u; int pos; loop: while (i.hasNext()) try { o = i.next(); - if (o instanceof MultiProtocolURI) - url = (MultiProtocolURI) o; + if (o instanceof DigestURI) + url = (DigestURI) o; else if (o instanceof String) - url = new MultiProtocolURI((String) o); + url = new DigestURI((String) o); else if (o instanceof 
ImageEntry) url = ((ImageEntry) o).url(); else { @@ -615,7 +616,7 @@ dc_rights u = u.substring(pos); while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos); - url = new MultiProtocolURI(u); + url = new DigestURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -625,7 +626,7 @@ dc_rights u = "http:/" + u.substring(pos); while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) u = "http:/" + u.substring(pos); - url = new MultiProtocolURI(u); + url = new DigestURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -689,12 +690,12 @@ dc_rights return c; } - public Set inboundLinks() { + public Set inboundLinks() { if (this.inboundlinks == null) resortLinks(); return (this.inboundlinks == null) ? null : this.inboundlinks.keySet(); } - public Set outboundLinks() { + public Set outboundLinks() { if (this.outboundlinks == null) resortLinks(); return (this.outboundlinks == null) ? null : this.outboundlinks.keySet(); } @@ -764,9 +765,7 @@ dc_rights * @param docs * @return */ - public static Document mergeDocuments(final DigestURI location, - final String globalMime, final Document[] docs) - { + public static Document mergeDocuments(final DigestURI location, final String globalMime, final Document[] docs) { if (docs == null || docs.length == 0) return null; if (docs.length == 1) return docs[0]; @@ -778,9 +777,9 @@ dc_rights final StringBuilder description = new StringBuilder(80); final Collection titles = new LinkedHashSet(); final Collection sectionTitles = new LinkedHashSet(); - final Map anchors = new HashMap(); - final Map rss = new HashMap(); - final Map images = new HashMap(); + final Map anchors = new HashMap(); + final Map rss = new HashMap(); + final Map images = new HashMap(); double lon = 0.0d, lat = 0.0d; for (final Document doc: docs) { @@ -856,22 +855,22 @@ dc_rights false); } - public static Map getHyperlinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map 
getHyperlinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { result.putAll(d.getHyperlinks()); final Object parser = d.getParserObject(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; String refresh = html.getRefreshPath(); - if (refresh != null && refresh.length() > 0)try {result.put(new MultiProtocolURI(refresh), "refresh");} catch (MalformedURLException e) {} + if (refresh != null && refresh.length() > 0)try {result.put(new DigestURI(refresh), "refresh");} catch (MalformedURLException e) {} } } return result; } - public static Map getImagelinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getImagelinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { for (final ImageEntry imageReference : d.getImages().values()) { // construct a image name which contains the document title to enhance the search process for images @@ -881,30 +880,30 @@ dc_rights return result; } - public static Map getAudiolinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getAudiolinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.audiolinks.entrySet()) { + for (Map.Entry e: d.audiolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } - public static Map getVideolinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getVideolinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.videolinks.entrySet()) { + for (Map.Entry e: d.videolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } - public static Map getApplinks(final Document[] documents) { - final Map result = new HashMap(); + public static 
Map getApplinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.applinks.entrySet()) { + for (Map.Entry e: d.applinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 502e63da3..55fbe8a75 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -53,6 +53,7 @@ import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -121,11 +122,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // class variables: collectors for links - private final Map anchors; - private final Map rss, css; - private final Set script, frames, iframes; - private final Map embeds; // urlhash/embed relation - private final Map images; // urlhash/image relation + private final Map anchors; + private final Map rss, css; + private final Set script, frames, iframes; + private final Map embeds; // urlhash/embed relation + private final Map images; // urlhash/image relation private final Map metas; private LinkedHashSet titles; //private String headline; @@ -135,7 +136,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final CharBuffer content; private final EventListenerList htmlFilterEventListeners; private double lon, lat; - private MultiProtocolURI canonical; + private DigestURI canonical; private final int maxLinks; private int breadcrumbs; @@ -148,7 +149,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { /** * The 
document root {@link MultiProtocolURI} */ - private MultiProtocolURI root; + private DigestURI root; /** * evaluation scores: count appearance of specific attributes @@ -156,7 +157,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final Evaluation evaluationScores; @SuppressWarnings("unchecked") - public ContentScraper(final MultiProtocolURI root, int maxLinks) { + public ContentScraper(final DigestURI root, int maxLinks) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); @@ -164,15 +165,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.root = root; this.maxLinks = maxLinks; this.evaluationScores = new Evaluation(); - this.rss = new SizeLimitedMap(maxLinks); - this.css = new SizeLimitedMap(maxLinks); - this.anchors = new SizeLimitedMap(maxLinks); - this.images = new SizeLimitedMap(maxLinks); - this.embeds = new SizeLimitedMap(maxLinks); - this.frames = new SizeLimitedSet(maxLinks); - this.iframes = new SizeLimitedSet(maxLinks); + this.rss = new SizeLimitedMap(maxLinks); + this.css = new SizeLimitedMap(maxLinks); + this.anchors = new SizeLimitedMap(maxLinks); + this.images = new SizeLimitedMap(maxLinks); + this.embeds = new SizeLimitedMap(maxLinks); + this.frames = new SizeLimitedSet(maxLinks); + this.iframes = new SizeLimitedSet(maxLinks); this.metas = new SizeLimitedMap(maxLinks); - this.script = new SizeLimitedSet(maxLinks); + this.script = new SizeLimitedSet(maxLinks); this.titles = new LinkedHashSet(); this.headlines = new ArrayList[6]; for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList(); @@ -194,7 +195,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.content.trimToSize(); } - private void mergeAnchors(final MultiProtocolURI url, final Properties p) { + private void mergeAnchors(final DigestURI url, final Properties p) { final Properties p0 
= this.anchors.get(url); if (p0 == null) { this.anchors.put(url, p); @@ -282,7 +283,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // find http links inside text s = 0; String u; - MultiProtocolURI url; + DigestURI url; while (s < b.length()) { p = find(b, dpssp, s); if (p == Integer.MAX_VALUE) break; @@ -294,7 +295,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above s = p + 6; try { - url = new MultiProtocolURI(u); + url = new DigestURI(u); mergeAnchors(url, new Properties()); continue; } catch (final MalformedURLException e) {} @@ -317,9 +318,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { return (p < 0) ? Integer.MAX_VALUE : p; } - private MultiProtocolURI absolutePath(final String relativePath) { + private DigestURI absolutePath(final String relativePath) { try { - return MultiProtocolURI.newURL(this.root, relativePath); + return DigestURI.newURL(this.root, relativePath); } catch (final Exception e) { return null; } @@ -331,7 +332,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final MultiProtocolURI url = absolutePath(src); + final DigestURI url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); @@ -343,10 +344,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.imgpath, src); } else if(tagname.equalsIgnoreCase("base")) { try { - this.root = new MultiProtocolURI(tagopts.getProperty("href", EMPTY_STRING)); + this.root = new DigestURI(tagopts.getProperty("href", EMPTY_STRING)); } catch (final MalformedURLException e) {} } else if (tagname.equalsIgnoreCase("frame")) 
{ - final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); + final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); mergeAnchors(src, tagopts /* with property "name" */); this.frames.add(src); @@ -384,13 +385,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String href = tagopts.getProperty("href", EMPTY_STRING); if (href.length() > 0) { tagopts.put("nme", areatitle); - MultiProtocolURI url = absolutePath(href); + DigestURI url = absolutePath(href); tagopts.put("href", url.toNormalform(true)); mergeAnchors(url, tagopts); } } else if (tagname.equalsIgnoreCase("link")) { final String href = tagopts.getProperty("href", EMPTY_STRING); - final MultiProtocolURI newLink = absolutePath(href); + final DigestURI newLink = absolutePath(href); if (newLink != null) { tagopts.put("href", newLink.toNormalform(true)); @@ -420,7 +421,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final MultiProtocolURI url = absolutePath(src); + final DigestURI url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); @@ -434,12 +435,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if(tagname.equalsIgnoreCase("param")) { final String name = tagopts.getProperty("name", EMPTY_STRING); if (name.equalsIgnoreCase("movie")) { - MultiProtocolURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); + DigestURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); tagopts.put("value", url.toNormalform(true)); mergeAnchors(url, tagopts /* with property "name" */); } } else if (tagname.equalsIgnoreCase("iframe")) { - final MultiProtocolURI src = 
absolutePath(tagopts.getProperty("src", EMPTY_STRING)); + final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); mergeAnchors(src, tagopts /* with property "name" */); this.iframes.add(src); @@ -459,7 +460,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { final String href = tagopts.getProperty("href", EMPTY_STRING); - MultiProtocolURI url; + DigestURI url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { final String f = url.getFileName(); final int p = f.lastIndexOf('.'); @@ -552,7 +553,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } catch (IOException e) { } } - for (final Map.Entry entry: scraper.getAnchors().entrySet()) { + for (final Map.Entry entry: scraper.getAnchors().entrySet()) { mergeAnchors(entry.getKey(), entry.getValue()); } this.images.putAll(scraper.images); @@ -640,15 +641,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.li.toArray(new String[this.li.size()]); } - public MultiProtocolURI[] getFlash() { + public DigestURI[] getFlash() { String ext; - ArrayList f = new ArrayList(); - for (final MultiProtocolURI url: this.anchors.keySet()) { + ArrayList f = new ArrayList(); + for (final DigestURI url: this.anchors.keySet()) { ext = url.getFileExtension(); if (ext == null) continue; if (ext.equals("swf")) f.add(url); } - return f.toArray(new MultiProtocolURI[f.size()]); + return f.toArray(new DigestURI[f.size()]); } public boolean containsFlash() { @@ -674,36 +675,36 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - public Map getAnchors() { + public Map getAnchors() { // returns a url (String) / name (String) relation return this.anchors; } - public Map 
getRSS() { + public Map getRSS() { // returns a url (String) / name (String) relation return this.rss; } - public Map getCSS() { + public Map getCSS() { // returns a url (String) / name (String) relation return this.css; } - public Set getFrames() { + public Set getFrames() { // returns a url (String) / name (String) relation return this.frames; } - public Set getIFrames() { + public Set getIFrames() { // returns a url (String) / name (String) relation return this.iframes; } - public Set getScript() { + public Set getScript() { return this.script; } - public MultiProtocolURI getCanonical() { + public DigestURI getCanonical() { return this.canonical; } @@ -711,11 +712,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { * get all images * @return a map of */ - public Map getImages() { + public Map getImages() { return this.images; } - public Map getEmbeds() { + public Map getEmbeds() { return this.embeds; } @@ -970,29 +971,29 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false, maxLinks); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURI("http://localhost"),null,false, maxLinks); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content - final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"), maxLinks); + final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost"), maxLinks); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new 
ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); return scraper; } - public static void addAllImages(final Map a, final Map b) { - final Iterator> i = b.entrySet().iterator(); - Map.Entry ie; + public static void addAllImages(final Map a, final Map b) { + final Iterator> i = b.entrySet().iterator(); + Map.Entry ie; while (i.hasNext()) { ie = i.next(); addImage(a, ie.getValue()); } } - public static void addImage(final Map a, final ImageEntry ie) { + public static void addImage(final Map a, final ImageEntry ie) { if (a.containsKey(ie.url())) { // in case of a collision, take that image that has the better image size tags if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie); diff --git a/source/net/yacy/document/parser/html/EmbedEntry.java b/source/net/yacy/document/parser/html/EmbedEntry.java index f620f507f..cbaaffb2a 100644 --- a/source/net/yacy/document/parser/html/EmbedEntry.java +++ b/source/net/yacy/document/parser/html/EmbedEntry.java @@ -20,15 +20,15 @@ package net.yacy.document.parser.html; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.kelondro.data.meta.DigestURI; public class EmbedEntry { - private final MultiProtocolURI url; + private final DigestURI url; private final int width, height; private final String type, pluginspage; - public EmbedEntry(final MultiProtocolURI url, int width, int height, String type, String pluginspage) { + public EmbedEntry(final DigestURI url, int width, int height, String type, String pluginspage) { this.url = url; this.width = width; this.height = height; @@ -36,7 +36,7 @@ public class EmbedEntry { this.pluginspage = pluginspage; } - public MultiProtocolURI getUrl() { + public DigestURI getUrl() { return this.url; } diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java index e795fa5f5..37419fffc 100644 --- a/source/net/yacy/document/parser/html/ImageEntry.java +++ 
b/source/net/yacy/document/parser/html/ImageEntry.java @@ -26,16 +26,16 @@ package net.yacy.document.parser.html; import java.util.Comparator; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.kelondro.data.meta.DigestURI; public class ImageEntry implements Comparable, Comparator { - private final MultiProtocolURI url; + private final DigestURI url; private final String alt; private final int width, height; private final long fileSize; - public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) { + public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) { assert url != null; this.url = url; this.alt = alt; @@ -44,7 +44,7 @@ public class ImageEntry implements Comparable, Comparator languages = new HashSet(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); + final HashMap anchors = new HashMap(); + final HashMap images = new HashMap(); // add this image to the map of images final String infoString = ii.info.toString(); images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); @@ -223,7 +223,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final MultiProtocolURI location, + final DigestURI location, final InputStream sourceStream) throws Parser.Failure { BufferedImage image = null; try { @@ -238,7 +238,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final MultiProtocolURI location, + final DigestURI location, final BufferedImage image) { final ImageInfo ii = new ImageInfo(location); ii.image = image; @@ -275,12 +275,12 @@ public class genericImageParser extends AbstractParser implements Parser { } public static class ImageInfo { - public MultiProtocolURI location; + public DigestURI location; public BufferedImage image; public StringBuilder 
info; public int height; public int width; - public ImageInfo(final MultiProtocolURI location) { + public ImageInfo(final DigestURI location) { this.location = location; this.image = null; this.info = new StringBuilder(); diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 4917f0fdd..8c471e388 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -37,7 +37,6 @@ import java.util.Properties; import java.util.Set; import net.yacy.cora.document.Hit; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSReader; import net.yacy.document.AbstractParser; @@ -75,13 +74,13 @@ public class rssParser extends AbstractParser implements Parser { final List docs = new ArrayList(); DigestURI uri; Set languages; - Map anchors; + Map anchors; Document doc; for (final Hit item: feed) try { uri = new DigestURI(item.getLink()); languages = new HashSet(); languages.add(item.getLanguage()); - anchors = new HashMap(); + anchors = new HashMap(); Properties p = new Properties(); p.put("name", item.getTitle()); anchors.put(uri, p); @@ -102,7 +101,7 @@ public class rssParser extends AbstractParser implements Parser { null, anchors, null, - new HashMap(), + new HashMap(), false); docs.add(doc); } catch (MalformedURLException e) { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index b4812211a..6e9204fa6 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -33,7 +33,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -168,7 +167,7 @@ public class sevenzipParser extends 
AbstractParser implements Parser { Document[] theDocs; // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects - final DigestURI url = DigestURI.toDigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath)); + final DigestURI url = DigestURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index 359766e0d..238fed1f4 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -40,7 +40,6 @@ import java.util.zip.GZIPInputStream; import javax.xml.parsers.DocumentBuilderFactory; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -96,7 +95,7 @@ public class sitemapParser extends AbstractParser implements Parser { null, null, null, - new HashMap(), + new HashMap(), false); docs.add(doc); } catch (MalformedURLException e) { diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 58d80e399..2f974aa2d 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -33,7 +33,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Properties; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -79,7 +78,7 @@ public class swfParser extends AbstractParser implements Parser { final 
String[] sections = null; final String abstrct = null; //TreeSet images = null; - final Map anchors = new HashMap(); + final Map anchors = new HashMap(); int urls = 0; int urlStart = -1; int urlEnd = 0; @@ -98,7 +97,7 @@ public class swfParser extends AbstractParser implements Parser { urlnr = Integer.toString(++urls).toString(); final Properties p = new Properties(); p.put("name", urlnr); - anchors.put(new MultiProtocolURI(url), p); + anchors.put(new DigestURI(url), p); contents = contents.substring(0,urlStart)+contents.substring(urlEnd); } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 3e098d5c5..d2507cf15 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -33,7 +33,6 @@ import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -90,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(DigestURI.toDigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp); + subDocs = TextParser.parseSource(DigestURI.newURL(url, "#" + name), mime, null, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 8304a2e81..8ffe75658 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -37,7 +37,6 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.Properties; -import net.yacy.cora.document.MultiProtocolURI; import 
net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.document.AbstractParser; @@ -70,7 +69,7 @@ public class vcfParser extends AbstractParser implements Parser { final StringBuilder parsedTitle = new StringBuilder(); final StringBuilder parsedDataText = new StringBuilder(); final HashMap parsedData = new HashMap(); - final HashMap anchors = new HashMap(); + final HashMap anchors = new HashMap(); final LinkedList parsedNames = new LinkedList(); boolean useLastLine = false; @@ -177,7 +176,7 @@ public class vcfParser extends AbstractParser implements Parser { parsedData.clear(); } else if (key.toUpperCase().startsWith("URL")) { try { - final MultiProtocolURI newURL = new MultiProtocolURI(value); + final DigestURI newURL = new DigestURI(value); final Properties p = new Properties(); p.put("name", newURL.toString()); anchors.put(newURL, p); diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index a124bd946..dc1346140 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -32,7 +32,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -87,7 +86,7 @@ public class zipParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); - final DigestURI virtualURL = DigestURI.toDigestURI(MultiProtocolURI.newURL(url, "#" + name)); + final DigestURI virtualURL = DigestURI.newURL(url, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); docs = TextParser.parseSource(virtualURL, mime, null, tmp); if (docs == null) continue; diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java 
b/source/net/yacy/kelondro/data/meta/DigestURI.java index 190ef46d7..a6b9dba11 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -138,15 +138,12 @@ public class DigestURI extends MultiProtocolURI implements Serializable { * DigestURI from general URI * @param u */ + /* private DigestURI(final MultiProtocolURI u) { super(u); this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null; } - - - public static DigestURI toDigestURI(MultiProtocolURI u) { - return (u instanceof DigestURI) ? ((DigestURI) u) : new DigestURI(u); - } + */ /** * DigestURI from general URI, hash already calculated @@ -168,6 +165,23 @@ public class DigestURI extends MultiProtocolURI implements Serializable { this.hash = null; } + public static DigestURI newURL(final DigestURI baseURL, String relPath) throws MalformedURLException { + if (relPath.startsWith("//")) { + // patch for urls starting with "//" which can be found in the wild + relPath = (baseURL == null) ? 
"http:" + relPath : baseURL.getProtocol() + ":" + relPath; + } + if ((baseURL == null) || + isHTTP(relPath) || + isHTTPS(relPath) || + isFTP(relPath) || + isFile(relPath) || + isSMB(relPath)/*|| + relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) { + return new DigestURI(relPath); + } + return new DigestURI(baseURL, relPath); + } + private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful @Override diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index e30bcaaa9..97633606c 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -169,7 +169,7 @@ public class Transmission { notFoundx.add(e.urlhash()); continue; } - if (!Transmission.this.segment.fulltext().exists(e.urlhash())) { + if (!Transmission.this.segment.fulltext().exists(ASCII.String(e.urlhash()))) { notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); } else { diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index 2d1427d00..5e0cbd555 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -46,7 +46,6 @@ import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.sorting.ClusteredScoreMap; @@ -83,9 +82,9 @@ public class WebStructureGraph { private static class LearnObject { private final DigestURI url; - private final Set globalRefURLs; + private final Set globalRefURLs; - private LearnObject(final DigestURI url, final Set globalRefURLs) { + private LearnObject(final DigestURI url, final Set globalRefURLs) { this.url = url; 
this.globalRefURLs = globalRefURLs; } @@ -160,11 +159,11 @@ public class WebStructureGraph { public void generateCitationReference(final DigestURI url, final Document document) { // generate citation reference - final Map hl = document.getHyperlinks(); - final Iterator it = hl.keySet().iterator(); - final HashSet globalRefURLs = new HashSet(); + final Map hl = document.getHyperlinks(); + final Iterator it = hl.keySet().iterator(); + final HashSet globalRefURLs = new HashSet(); final String refhost = url.getHost(); - MultiProtocolURI u; + DigestURI u; int maxref = 1000; while ( it.hasNext() && maxref-- > 0 ) { u = it.next(); @@ -191,7 +190,7 @@ public class WebStructureGraph { } public void generateCitationReference(final DigestURI from, final DigestURI to) { - final HashSet globalRefURLs = new HashSet(); + final HashSet globalRefURLs = new HashSet(); final String refhost = from.getHost(); if (refhost != null && to.getHost() != null && !to.getHost().equals(refhost)) globalRefURLs.add(to); final LearnObject lro = new LearnObject(from, globalRefURLs); @@ -586,12 +585,10 @@ public class WebStructureGraph { private void learnrefs(final LearnObject lro) { final Set refhosts = new HashSet(); - DigestURI du; String hosthash; - for ( final MultiProtocolURI u : lro.globalRefURLs ) { + for ( final DigestURI u : lro.globalRefURLs ) { if (Switchboard.getSwitchboard().shallTerminate()) break; - du = DigestURI.toDigestURI(u); - hosthash = ASCII.String(du.hash(), 6, 6); + hosthash = ASCII.String(u.hash(), 6, 6); if (!exists(hosthash)) { // this must be recorded as an host with no references synchronized ( this.structure_new ) { diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index ae5a3e302..742e8b243 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -245,10 +245,10 @@ public final class yacyRelease extends yacyVersion { } // analyze links in 
scraper resource, and find link to latest release in it - final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation + final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation final TreeSet mainReleases = new TreeSet(); final TreeSet devReleases = new TreeSet(); - for (final MultiProtocolURI url : anchors.keySet()) { + for (final DigestURI url : anchors.keySet()) { try { final yacyRelease release = new yacyRelease(url, location.getPublicKey()); //System.out.println("r " + release.toAnchor()); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 4f8102f23..a65c4dae4 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -39,7 +39,6 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; @@ -381,7 +380,7 @@ public final class LoaderDispatcher { * @return a map from URLs to the anchor texts of the urls * @throws IOException */ - public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException { + public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException { final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 575889858..8ed22305f 100644 --- 
a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1545,12 +1545,12 @@ public final class Switchboard extends serverSwitch { return false; } - public HarvestProcess urlExists(final byte[] hash) { + public HarvestProcess urlExists(final String hash) { // tests if hash occurrs in any database // if it exists, the name of the database is returned, // if it not exists, null is returned if (this.index.exists(hash)) return HarvestProcess.LOADED; - return this.crawlQueues.urlExists(hash); + return this.crawlQueues.urlExists(ASCII.getBytes(hash)); } public void urlRemove(final Segment segment, final byte[] hash) { @@ -2494,7 +2494,7 @@ public final class Switchboard extends serverSwitch { ) ) { // get the hyperlinks - final Map hl = Document.getHyperlinks(documents); + final Map hl = Document.getHyperlinks(documents); // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links if (response.profile().directDocByURL()) { @@ -2506,7 +2506,7 @@ public final class Switchboard extends serverSwitch { // insert those hyperlinks to the crawler MultiProtocolURI nextUrl; - for ( final Map.Entry nextEntry : hl.entrySet() ) { + for ( final Map.Entry nextEntry : hl.entrySet() ) { // check for interruption checkInterruption(); @@ -2654,7 +2654,7 @@ public final class Switchboard extends serverSwitch { // CREATE INDEX final String dc_title = document.dc_title(); - final DigestURI url = DigestURI.toDigestURI(document.dc_source()); + final DigestURI url = document.dc_source(); final DigestURI referrerURL = queueEntry.referrerURL(); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); @@ -2711,14 +2711,14 @@ public final class Switchboard extends serverSwitch { feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url(), ASCII.String(queueEntry.url().hash()))); // store rss feeds in document into rss table - for ( final Map.Entry rssEntry : 
document.getRSS().entrySet() ) { + for ( final Map.Entry rssEntry : document.getRSS().entrySet() ) { final Tables.Data rssRow = new Tables.Data(); rssRow.put("referrer", url.hash()); rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true))); rssRow.put("title", UTF8.getBytes(rssEntry.getValue())); rssRow.put("recording_date", new Date()); try { - this.tables.update("rss", DigestURI.toDigestURI(rssEntry.getKey()).hash(), rssRow); + this.tables.update("rss", rssEntry.getKey().hash(), rssRow); } catch ( final IOException e ) { Log.logException(e); } @@ -2760,7 +2760,7 @@ public final class Switchboard extends serverSwitch { public final void addAllToIndex( final DigestURI url, - final Map links, + final Map links, final SearchEvent searchEvent, final String heuristicName) { @@ -2775,10 +2775,10 @@ public final class Switchboard extends serverSwitch { } // check if some of the links match with the query - final Map matcher = searchEvent.query.separateMatches(links); + final Map matcher = searchEvent.query.separateMatches(links); // take the matcher and load them all - for ( final Map.Entry entry : matcher.entrySet() ) { + for ( final Map.Entry entry : matcher.entrySet() ) { try { addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); } catch ( final IOException e ) { @@ -2787,7 +2787,7 @@ public final class Switchboard extends serverSwitch { } // take then the no-matcher and load them also - for ( final Map.Entry entry : links.entrySet() ) { + for ( final Map.Entry entry : links.entrySet() ) { try { addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); } catch ( final IOException e ) { @@ -2926,10 +2926,10 @@ public final class Switchboard extends serverSwitch { public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, Parser.Failure { - if ( searchEvent != null ) { + if (searchEvent != null) { searchEvent.addHeuristic(url.hash(), 
heuristicName, true); } - if ( this.index.exists(url.hash()) ) { + if (this.index.exists(ASCII.String(url.hash()))) { return; // don't do double-work } final Request request = this.loader.request(url, true, true); @@ -3004,7 +3004,7 @@ public final class Switchboard extends serverSwitch { */ public void addToCrawler(final DigestURI url, final boolean asglobal) { - if ( this.index.exists(url.hash()) ) { + if (this.index.exists(ASCII.String(url.hash()))) { return; // don't do double-work } final Request request = this.loader.request(url, true, true); @@ -3204,7 +3204,7 @@ public final class Switchboard extends serverSwitch { return "no DHT distribution: not enabled (per setting)"; } final Segment indexSegment = this.index; - int size = indexSegment.fulltext().size(); + long size = indexSegment.fulltext().collectionSize(); if ( size < 10 ) { return "no DHT distribution: loadedURL.size() = " + size; } @@ -3348,12 +3348,12 @@ public final class Switchboard extends serverSwitch { return; } - final Map links; + final Map links; searchEvent.rankingProcess.oneFeederStarted(); try { links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay); if ( links != null ) { - final Iterator i = links.keySet().iterator(); + final Iterator i = links.keySet().iterator(); while ( i.hasNext() ) { if ( !i.next().getHost().endsWith(host) ) { i.remove(); @@ -3387,16 +3387,16 @@ public final class Switchboard extends serverSwitch { return; } - final Map links; + final Map links; DigestURI url; try { links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay); if (links != null) { if (links.size() < 1000) { // limit to 1000 to skip large index pages - final Iterator i = links.keySet().iterator(); + final Iterator i = links.keySet().iterator(); final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false); while 
(i.hasNext()) { - url = DigestURI.toDigestURI(i.next()); + url = i.next(); boolean islocal = url.getHost().contentEquals(startUrl.getHost()); // add all external links or links to different page to crawler if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) { @@ -3458,11 +3458,11 @@ public final class Switchboard extends serverSwitch { //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); if ( rss != null ) { - final Map links = new TreeMap(); - MultiProtocolURI uri; + final Map links = new TreeMap(); + DigestURI uri; for ( final RSSMessage message : rss.getFeed() ) { try { - uri = new MultiProtocolURI(message.getLink()); + uri = new DigestURI(message.getLink()); links.put(uri, message.getTitle()); } catch ( final MalformedURLException e ) { } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 65342c7f3..ebcd82526 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.PrintWriter; import java.net.MalformedURLException; import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -195,9 +196,12 @@ public final class Fulltext { this.solrInstances.disconnect1(); } - public EmbeddedSolrConnector getDefaultLocalSolrConnector() { - if (this.solrInstances.getSolr0() == null) return null; - return new EmbeddedSolrConnector(this.solrInstances.getSolr0()); + public EmbeddedSolrConnector getDefaultEmbeddedConnector() { + return this.solrInstances.getDefaultEmbeddedConnector(); + } + + public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + return this.solrInstances.getEmbeddedConnector(corename); } public RemoteSolrConnector getDefaultRemoteSolrConnector() { @@ -210,11 +214,11 @@ public final class Fulltext { } public 
SolrConnector getDefaultConnector() { - return this.solrInstances.getDefaultConnector(); + return this.solrInstances.getDefaultMirrorConnector(); } public SolrConnector getWebgraphConnector() { - return this.solrInstances.getConnector(WebgraphSchema.CORE_NAME); + return this.solrInstances.getMirrorConnector(WebgraphSchema.CORE_NAME); } public void clearCache() { @@ -232,7 +236,7 @@ public final class Fulltext { this.urlIndexFile.clear(); } this.statsDump = null; - this.solrInstances.getDefaultConnector().commit(true); + this.commit(true); } public void clearLocalSolr() throws IOException { @@ -240,6 +244,7 @@ public final class Fulltext { if (instance != null) { for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear(); } + this.commit(false); this.solrInstances.clearCache(); } @@ -255,11 +260,19 @@ public final class Fulltext { * get the size of the default index * @return */ - public int size() { - int size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size(); - size += this.solrInstances.getDefaultConnector().getSize(); + public long collectionSize() { + long size = this.urlIndexFile == null ? 
0 : this.urlIndexFile.size(); + size += this.getDefaultConnector().getSize(); return size; } + + /** + * get the size of the webgraph index + * @return + */ + public long webgraphSize() { + return this.getWebgraphConnector().getSize(); + } public void close() { this.statsDump = null; @@ -279,7 +292,7 @@ public final class Fulltext { if (urlHash == null) return null; Date x; try { - x = (Date) this.solrInstances.getDefaultConnector().getFieldById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName()); + x = (Date) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName()); } catch (IOException e) { return null; } @@ -290,7 +303,7 @@ public final class Fulltext { if (urlHash == null) return null; String x; try { - x = (String) this.solrInstances.getDefaultConnector().getFieldById(ASCII.String(urlHash), CollectionSchema.sku.getSolrFieldName()); + x = (String) this.getDefaultConnector().getFieldById(ASCII.String(urlHash), CollectionSchema.sku.getSolrFieldName()); } catch (IOException e) { return null; } @@ -317,7 +330,7 @@ public final class Fulltext { // get the metadata from Solr try { - SolrDocument doc = this.solrInstances.getDefaultConnector().getById(ASCII.String(urlHash)); + SolrDocument doc = this.getDefaultConnector().getById(ASCII.String(urlHash)); if (doc != null) { if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); return new URIMetadataNode(doc, wre, weight); @@ -346,17 +359,27 @@ public final class Fulltext { String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); byte[] idb = ASCII.getBytes(id); try { - if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - Date sdDate = (Date) this.solrInstances.getDefaultConnector().getFieldById(id, CollectionSchema.last_modified.getSolrFieldName()); - Date docDate = null; - if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) { + if 
(this.urlIndexFile != null) this.urlIndexFile.remove(idb); + Date sdDate = (Date) this.getDefaultConnector().getFieldById(id, CollectionSchema.last_modified.getSolrFieldName()); + Date docDate = null; + if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) { if (this.collectionConfiguration.contains(CollectionSchema.ip_s)) { // ip_s needs a dns lookup which causes blockings during search here - this.solrInstances.getDefaultConnector().add(doc); + this.getDefaultConnector().add(doc); } else synchronized (this.solrInstances) { - this.solrInstances.getDefaultConnector().add(doc); + this.getDefaultConnector().add(doc); } - } + } + } catch (SolrException e) { + throw new IOException(e.getMessage(), e); + } + this.statsDump = null; + if (MemoryControl.shortStatus()) clearCache(); + } + + public void putEdges(final Collection edges) throws IOException { + try { + this.getWebgraphConnector().add(edges); } catch (SolrException e) { throw new IOException(e.getMessage(), e); } @@ -371,13 +394,13 @@ public final class Fulltext { String id = ASCII.String(idb); try { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - SolrDocument sd = this.solrInstances.getDefaultConnector().getById(id); + SolrDocument sd = this.getDefaultConnector().getById(id); if (sd == null || (new URIMetadataNode(sd)).isOlder(row)) { if (this.collectionConfiguration.contains(CollectionSchema.ip_s)) { // ip_s needs a dns lookup which causes blockings during search here - this.solrInstances.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); + this.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); } else synchronized (this.solrInstances) { - this.solrInstances.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); + this.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); } } } catch (SolrException e) { @@ -397,15 +420,22 @@ public 
final class Fulltext { public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) { // first collect all url hashes that belong to the domain assert hosthash.length() == 6; - final String q = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); + final String collection1Query = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); + final String webgraphQuery = WebgraphSchema.source_host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); Thread t = new Thread() { public void run() { // delete in solr synchronized (Fulltext.this.solrInstances) { - try { - Fulltext.this.solrInstances.getDefaultConnector().deleteByQuery(q); - } catch (IOException e) {} + try {Fulltext.this.getDefaultConnector().deleteByQuery(collection1Query);} catch (IOException e) {} + try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (IOException e) {} } // delete in old metadata structure @@ -443,21 +473,30 @@ public final class Fulltext { }; if (concurrent) t.start(); else { t.run(); - Fulltext.this.getDefaultConnector().commit(true); + Fulltext.this.commit(true); } } public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) { // first collect all url hashes that belong to the domain - final String q = CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" + - ((freshdate != 
null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); + final String collectionQuery = + CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); + final String webgraphQuery = + WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + hostname + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); Thread t = new Thread() { public void run() { // delete in solr synchronized (Fulltext.this.solrInstances) { - try { - Fulltext.this.getDefaultConnector().deleteByQuery(q); - } catch (IOException e) {} + try {Fulltext.this.getDefaultConnector().deleteByQuery(collectionQuery);} catch (IOException e) {} + try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (IOException e) {} } // finally remove the line with statistics if (Fulltext.this.statsDump != null) { @@ -475,7 +514,7 @@ public final class Fulltext { }; if (concurrent) t.start(); else { t.run(); - Fulltext.this.getDefaultConnector().commit(true); + Fulltext.this.commit(true); } } @@ -489,12 +528,12 @@ public final class Fulltext { DigestURI uri; try {uri = new DigestURI(basepath);} catch (MalformedURLException e) {return 0;} final String host = uri.getHost(); - final String q = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + + final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + ((freshdate != null && freshdate.before(new Date())) ? 
(" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); final AtomicInteger count = new AtomicInteger(0); Thread t = new Thread(){ public void run() { - final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentQuery(q, 0, 1000000, 600000, -1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); + final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentQuery(collectionQuery, 0, 1000000, 600000, -1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); try { SolrDocument doc; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { @@ -504,7 +543,7 @@ public final class Fulltext { count.incrementAndGet(); } } - if (count.get() > 0) Fulltext.this.getDefaultConnector().commit(true); + if (count.get() > 0) Fulltext.this.commit(true); } catch (InterruptedException e) {} } }; @@ -525,8 +564,9 @@ public final class Fulltext { synchronized (Fulltext.this.solrInstances) { for (byte[] urlHash: deleteIDs) { Fulltext.this.getDefaultConnector().delete(ASCII.String(urlHash)); + Fulltext.this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":" + ASCII.String(urlHash)); } - Fulltext.this.getDefaultConnector().commit(true); + Fulltext.this.commit(true); } } catch (final Throwable e) { Log.logException(e); @@ -546,6 +586,7 @@ public final class Fulltext { try { synchronized (this.solrInstances) { this.getDefaultConnector().delete(ASCII.String(urlHash)); + this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":" + ASCII.String(urlHash)); } } catch (final Throwable e) { Log.logException(e); @@ -560,11 +601,11 @@ public final class Fulltext { return false; } - public boolean exists(final byte[] urlHash) { + public boolean exists(final String urlHash) { if (urlHash == null) return false; - if (this.urlIndexFile != null && 
this.urlIndexFile.has(urlHash)) return true; + if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true; try { - if (this.getDefaultConnector().exists(CollectionSchema.id.getSolrFieldName(), ASCII.String(urlHash))) return true; + if (this.getDefaultConnector().exists(CollectionSchema.id.getSolrFieldName(), urlHash)) return true; } catch (final Throwable e) { Log.logException(e); } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 94da14db4..55d068a83 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -195,7 +195,7 @@ public class Segment { } public long URLCount() { - return this.fulltext.size(); + return this.fulltext.collectionSize(); } public long RWICount() { @@ -219,7 +219,7 @@ public class Segment { return count; } - public boolean exists(final byte[] urlhash) { + public boolean exists(final String urlhash) { return this.fulltext.exists(urlhash); } @@ -284,16 +284,16 @@ public class Segment { return this.segmentPath; } - private int addCitationIndex(final DigestURI url, final Date urlModified, final Map anchors) { + private int addCitationIndex(final DigestURI url, final Date urlModified, final Map anchors) { if (anchors == null) return 0; int refCount = 0; // iterate over all outgoing links, this will create a context for those links final byte[] urlhash = url.hash(); final long urldate = urlModified.getTime(); - for (Map.Entry anchorEntry: anchors.entrySet()) { - MultiProtocolURI anchor = anchorEntry.getKey(); - byte[] refhash = DigestURI.toDigestURI(anchor).hash(); + for (Map.Entry anchorEntry: anchors.entrySet()) { + DigestURI anchor = anchorEntry.getKey(); + byte[] refhash = anchor.hash(); //System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString()); if (this.urlCitationIndex != null) try { 
this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate)); @@ -377,7 +377,7 @@ public class Segment { // DO A SOFT/HARD COMMIT IF NEEDED if (MemoryControl.shortStatus()) { // do a 'hard' commit to flush index caches - this.fulltext.getDefaultConnector().commit(false); + this.fulltext.commit(false); } else { if ( (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.exact_signature_l) && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.exact_signature_unique_b)) || @@ -404,7 +404,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final SolrInputDocument solrInputDoc = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex); + final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); // FIND OUT IF THIS IS A DOUBLE DOCUMENT for (CollectionSchema[] checkfields: new CollectionSchema[][]{ @@ -414,11 +414,11 @@ public class Segment { CollectionSchema uniquefield = checkfields[1]; if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) { // lookup the document with the same signature - long signature = ((Long) solrInputDoc.getField(checkfield.getSolrFieldName()).getValue()).longValue(); + long signature = ((Long) vector.getField(checkfield.getSolrFieldName()).getValue()).longValue(); try { if (this.fulltext.getDefaultConnector().exists(checkfield.getSolrFieldName(), Long.toString(signature))) { // change unique attribut in content - solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + vector.setField(uniquefield.getSolrFieldName(), false); } } catch (IOException e) {} } @@ -434,14 +434,14 @@ public class Segment { // lookup in 
the index for the same title String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description(); if (checkstring.length() == 0) { - solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + vector.setField(uniquefield.getSolrFieldName(), false); continue uniquecheck; } checkstring = ClientUtils.escapeQueryChars("\"" + checkstring + "\""); try { if (this.fulltext.getDefaultConnector().exists(checkfield.getSolrFieldName(), checkstring)) { // switch unique attribute in new document - solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + vector.setField(uniquefield.getSolrFieldName(), false); // switch attribute also in all existing documents (which should be exactly only one!) SolrDocumentList docs = this.fulltext.getDefaultConnector().query(checkfield.getSolrFieldName() + ":" + checkstring + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000); for (SolrDocument doc: docs) { @@ -450,7 +450,7 @@ public class Segment { this.fulltext.getDefaultConnector().add(sid); } } else { - solrInputDoc.setField(uniquefield.getSolrFieldName(), true); + vector.setField(uniquefield.getSolrFieldName(), true); } } catch (IOException e) {} } @@ -459,7 +459,7 @@ public class Segment { // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.urlCitationIndex != null && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.references_i)) { int references = this.urlCitationIndex.count(url.hash()); - if (references > 0) solrInputDoc.setField(CollectionSchema.references_i.getSolrFieldName(), references); + if (references > 0) vector.setField(CollectionSchema.references_i.getSolrFieldName(), references); } // STORE TO SOLR @@ -467,7 +467,20 @@ public class Segment { tryloop: for (int i = 0; i < 20; i++) { try { error = null; - this.fulltext.putDocument(solrInputDoc); + this.fulltext.putDocument(vector); + break tryloop; + } catch ( final IOException e ) { + error = "failed to send " + urlNormalform + " to solr"; + 
Log.logWarning("SOLR", error + e.getMessage()); + if (i == 10) this.fulltext.commit(false); + try {Thread.sleep(1000);} catch (InterruptedException e1) {} + continue tryloop; + } + } + tryloop: for (int i = 0; i < 20; i++) { + try { + error = null; + this.fulltext.putEdges(vector.getWebgraphDocuments()); break tryloop; } catch ( final IOException e ) { error = "failed to send " + urlNormalform + " to solr"; @@ -567,7 +580,7 @@ public class Segment { } // finished - return solrInputDoc; + return vector; } public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 339edd344..f05cb0372 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -46,7 +46,6 @@ import org.apache.solr.client.solrj.SolrQuery.ORDER; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; @@ -531,11 +530,11 @@ public final class QueryParams { return this.queryGoal; } - public final Map separateMatches(final Map links) { - final Map matcher = new HashMap(); - final Iterator > i = links.entrySet().iterator(); - Map.Entry entry; - MultiProtocolURI url; + public final Map separateMatches(final Map links) { + final Map matcher = new HashMap(); + final Iterator > i = links.entrySet().iterator(); + Map.Entry entry; + DigestURI url; String anchorText; while (i.hasNext()) { entry = i.next(); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 3f01e1e59..37860b6b0 100644 --- 
a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -38,7 +38,6 @@ import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import net.yacy.cora.document.ASCII; @@ -158,7 +157,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrInputDocument metadata2solr(final URIMetadataRow md) { final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = DigestURI.toDigestURI(md.url()); + final DigestURI digestURI = md.url(); boolean allAttr = this.isEmpty(); if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, ""); @@ -283,13 +282,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } - public SolrInputDocument yacy2solr( + public static class SolrVector extends SolrInputDocument { + private static final long serialVersionUID = -210901881471714939L; + private List webgraphDocuments; + public SolrVector() { + super(); + this.webgraphDocuments = new ArrayList(); + } + public void addWebgraphDocument(SolrInputDocument webgraphDocument) { + this.webgraphDocuments.add(webgraphDocument); + } + public List getWebgraphDocuments() { + return this.webgraphDocuments; + } + } + + public SolrVector yacy2solr( final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language, - IndexCell citations) { + IndexCell citations, + WebgraphConfiguration webgraph) { // we use the SolrCell design as index schema - final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source()); + SolrVector doc = new 
SolrVector(); + final DigestURI digestURI = document.dc_source(); boolean allAttr = this.isEmpty(); Set processTypes = new LinkedHashSet(); @@ -299,24 +314,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri String docurl = digestURI.toNormalform(true); add(doc, CollectionSchema.sku, docurl); + int clickdepth = -1; if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) { if (digestURI.probablyRootURL()) { boolean lc = this.lazy; this.lazy = false; - add(doc, CollectionSchema.clickdepth_i, 0); + clickdepth = 0; this.lazy = lc; } else { // search the citations for references - int clickdepth = -1; try { clickdepth = getClickDepth(citations, digestURI); } catch (IOException e) { add(doc, CollectionSchema.clickdepth_i, -1); } - add(doc, CollectionSchema.clickdepth_i, clickdepth); if (clickdepth < 0 || clickdepth > 1) { processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut } } + add(doc, CollectionSchema.clickdepth_i, clickdepth); } if (allAttr || contains(CollectionSchema.ip_s)) { @@ -415,12 +430,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension()); // get list of all links; they will be shrinked by urls that appear in other fields of the solr schema - Set inboundLinks = document.inboundLinks(); - Set outboundLinks = document.outboundLinks(); + Set inboundLinks = document.inboundLinks(); + Set outboundLinks = document.outboundLinks(); int c = 0; final Object parser = document.getParserObject(); - Map images = new HashMap(); + Map images = new HashMap(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; images = html.getImages(); @@ -546,11 +561,11 @@ public class CollectionConfiguration extends SchemaConfiguration 
implements Seri // style sheets if (allAttr || contains(CollectionSchema.css_tag_txt)) { - final Map csss = html.getCSS(); + final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; c = 0; - for (final Map.Entry entry: csss.entrySet()) { + for (final Map.Entry entry: csss.entrySet()) { final String cssurl = entry.getKey().toNormalform(false); inboundLinks.remove(cssurl); outboundLinks.remove(cssurl); @@ -567,10 +582,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // Scripts if (allAttr || contains(CollectionSchema.scripts_txt)) { - final Set scriptss = html.getScript(); + final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; - for (final MultiProtocolURI u: scriptss) { + for (final DigestURI u: scriptss) { inboundLinks.remove(u); outboundLinks.remove(u); scripts[c++] = u.toNormalform(false); @@ -581,10 +596,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // Frames if (allAttr || contains(CollectionSchema.frames_txt)) { - final Set framess = html.getFrames(); + final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; - for (final MultiProtocolURI u: framess) { + for (final DigestURI u: framess) { inboundLinks.remove(u); outboundLinks.remove(u); frames[c++] = u.toNormalform(false); @@ -595,10 +610,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // IFrames if (allAttr || contains(CollectionSchema.iframes_txt)) { - final Set iframess = html.getIFrames(); + final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; - for (final MultiProtocolURI u: iframess) { + for (final DigestURI u: iframess) { inboundLinks.remove(u); outboundLinks.remove(u); iframes[c++] = u.toNormalform(false); @@ -609,7 +624,7 @@ public class CollectionConfiguration extends 
SchemaConfiguration implements Seri // canonical tag if (allAttr || contains(CollectionSchema.canonical_t)) { - final MultiProtocolURI canonical = html.getCanonical(); + final DigestURI canonical = html.getCanonical(); if (canonical != null) { inboundLinks.remove(canonical); outboundLinks.remove(canonical); @@ -665,104 +680,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); } - // list all links - final Map alllinks = document.getAnchors(); - c = 0; + // statistics about the links if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, inboundLinks.size()); if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); - final List inboundlinksTag = new ArrayList(inboundLinks.size()); - final List inboundlinksURLProtocol = new ArrayList(inboundLinks.size()); - final List inboundlinksURLStub = new ArrayList(inboundLinks.size()); - final List inboundlinksName = new ArrayList(inboundLinks.size()); - final List inboundlinksRel = new ArrayList(inboundLinks.size()); - final List inboundlinksText = new ArrayList(inboundLinks.size()); - final List inboundlinksTextChars = new ArrayList(inboundLinks.size()); - final List inboundlinksTextWords = new ArrayList(inboundLinks.size()); - final List inboundlinksAltTag = new ArrayList(inboundLinks.size()); - for (final MultiProtocolURI u: inboundLinks) { - final Properties p = alllinks.get(u); - if (p == null) continue; - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String urls = u.toNormalform(false); - final int pr = 
urls.indexOf("://",0); - inboundlinksURLProtocol.add(urls.substring(0, pr)); - inboundlinksURLStub.add(urls.substring(pr + 3)); - inboundlinksName.add(name.length() > 0 ? name : ""); - inboundlinksRel.add(rel.length() > 0 ? rel : ""); - inboundlinksText.add(text.length() > 0 ? text : ""); - inboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); - inboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); - inboundlinksTag.add( - " 0 ? " rel=\"" + rel + "\"" : "") + - (name.length() > 0 ? " name=\"" + name + "\"" : "") + - ">" + - ((text.length() > 0) ? text : "") + ""); - ImageEntry ientry = images.get(u); - inboundlinksAltTag.add(ientry == null ? "" : ientry.alt()); - c++; - } - if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, inboundlinksTag); - if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(inboundlinksURLProtocol)); - if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, inboundlinksURLStub); - if (allAttr || contains(CollectionSchema.inboundlinks_name_txt)) add(doc, CollectionSchema.inboundlinks_name_txt, inboundlinksName); - if (allAttr || contains(CollectionSchema.inboundlinks_rel_sxt)) add(doc, CollectionSchema.inboundlinks_rel_sxt, inboundlinksRel); - if (allAttr || contains(CollectionSchema.inboundlinks_relflags_val)) add(doc, CollectionSchema.inboundlinks_relflags_val, relEval(inboundlinksRel)); - if (allAttr || contains(CollectionSchema.inboundlinks_text_txt)) add(doc, CollectionSchema.inboundlinks_text_txt, inboundlinksText); - if (allAttr || contains(CollectionSchema.inboundlinks_text_chars_val)) add(doc, CollectionSchema.inboundlinks_text_chars_val, inboundlinksTextChars); - if (allAttr || contains(CollectionSchema.inboundlinks_text_words_val)) add(doc, 
CollectionSchema.inboundlinks_text_words_val, inboundlinksTextWords); - if (allAttr || contains(CollectionSchema.inboundlinks_alttag_txt)) add(doc, CollectionSchema.inboundlinks_alttag_txt, inboundlinksAltTag); - - c = 0; if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size()); if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); - final List outboundlinksTag = new ArrayList(outboundLinks.size()); - final List outboundlinksURLProtocol = new ArrayList(outboundLinks.size()); - final List outboundlinksURLStub = new ArrayList(outboundLinks.size()); - final List outboundlinksName = new ArrayList(outboundLinks.size()); - final List outboundlinksRel = new ArrayList(outboundLinks.size()); - final List outboundlinksTextChars = new ArrayList(outboundLinks.size()); - final List outboundlinksTextWords = new ArrayList(outboundLinks.size()); - final List outboundlinksText = new ArrayList(outboundLinks.size()); - final List outboundlinksAltTag = new ArrayList(outboundLinks.size()); - for (final MultiProtocolURI u: outboundLinks) { - final Properties p = alllinks.get(u); - if (p == null) continue; - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String urls = u.toNormalform(false); - final int pr = urls.indexOf("://",0); - outboundlinksURLProtocol.add(urls.substring(0, pr)); - outboundlinksURLStub.add(urls.substring(pr + 3)); - outboundlinksName.add(name.length() > 0 ? name : ""); - outboundlinksRel.add(rel.length() > 0 ? rel : ""); - outboundlinksText.add(text.length() > 0 ? text : ""); - outboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); - outboundlinksTextWords.add(text.length() > 0 ? 
CommonPattern.SPACE.split(text).length : 0); - outboundlinksTag.add( - " 0 ? " rel=\"" + rel + "\"" : "") + - (name.length() > 0 ? " name=\"" + name + "\"" : "") + - ">" + - ((text.length() > 0) ? text : "") + ""); - ImageEntry ientry = images.get(u); - inboundlinksAltTag.add(ientry == null ? "" : ientry.alt()); - c++; - } - if (allAttr || contains(CollectionSchema.outboundlinks_tag_txt)) add(doc, CollectionSchema.outboundlinks_tag_txt, outboundlinksTag); - if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(outboundlinksURLProtocol)); - if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, outboundlinksURLStub); - if (allAttr || contains(CollectionSchema.outboundlinks_name_txt)) add(doc, CollectionSchema.outboundlinks_name_txt, outboundlinksName); - if (allAttr || contains(CollectionSchema.outboundlinks_rel_sxt)) add(doc, CollectionSchema.outboundlinks_rel_sxt, outboundlinksRel); - if (allAttr || contains(CollectionSchema.outboundlinks_relflags_val)) add(doc, CollectionSchema.outboundlinks_relflags_val, relEval(outboundlinksRel)); - if (allAttr || contains(CollectionSchema.outboundlinks_text_txt)) add(doc, CollectionSchema.outboundlinks_text_txt, outboundlinksText); - if (allAttr || contains(CollectionSchema.outboundlinks_text_chars_val)) add(doc, CollectionSchema.outboundlinks_text_chars_val, outboundlinksTextChars); - if (allAttr || contains(CollectionSchema.outboundlinks_text_words_val)) add(doc, CollectionSchema.outboundlinks_text_words_val, outboundlinksTextWords); - if (allAttr || contains(CollectionSchema.outboundlinks_alttag_txt)) add(doc, CollectionSchema.outboundlinks_alttag_txt, outboundlinksAltTag); - + + // list all links + WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, 
outboundLinks); + doc.webgraphDocuments.addAll(subgraph.edges); + if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, subgraph.tags[0]); + if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); + if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]); + if (allAttr || contains(CollectionSchema.outboundlinks_tag_txt)) add(doc, CollectionSchema.outboundlinks_tag_txt, subgraph.tags[1]); + if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1])); + if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, subgraph.urlStubs[1]); + // charset if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset()); @@ -896,6 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param rel * @return binary encoded information about rel */ + /* private static List relEval(final List rel) { List il = new ArrayList(rel.size()); for (final String s: rel) { @@ -907,6 +841,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } return il; } + */ /** * register an entry as error document diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 784dedf84..53b635ad2 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -107,9 +107,13 @@ public enum CollectionSchema implements SchemaDeclaration { // bit 12: "unavailable_after" contained in http header properties robots_i(SolrType.num_integer, 
true, true, false, "content of tag and the \"X-Robots-Tag\" HTTP property"), metagenerator_t(SolrType.text_general, true, true, false, "content of tag"), - inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, "internal links, only the protocol"), inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"), + inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), + outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"), + outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), + outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), + /* inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"), inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"), inboundlinks_relflags_val(SolrType.num_integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"), @@ -117,9 +121,6 @@ public enum CollectionSchema implements SchemaDeclaration { inboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of characters"), inboundlinks_text_words_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of words"), inboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), - outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, 
normalized (absolute URLs), as - tag with anchor text and nofollow"), - outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"), - outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"), outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"), outboundlinks_relflags_val(SolrType.num_integer, true, true, true, "external links, the rel property of the a-tag, coded binary"), @@ -127,6 +128,7 @@ public enum CollectionSchema implements SchemaDeclaration { outboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of characters"), outboundlinks_text_words_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of words"), outboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), + */ images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"), diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index cb78f65e5..573034785 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -27,10 +27,23 @@ package net.yacy.search.schema; import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Date; import java.util.Iterator; 
+import java.util.Map; +import java.util.Properties; +import java.util.Set; +import org.apache.solr.common.SolrInputDocument; + +import net.yacy.cora.document.ASCII; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.util.CommonPattern; +import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; public class WebgraphConfiguration extends SchemaConfiguration implements Serializable { @@ -74,7 +87,178 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } } } + + public static class Subgraph { + public final ArrayList[] tags, urlProtocols, urlStubs; + public final ArrayList edges; + @SuppressWarnings("unchecked") + public Subgraph(int inboundSize, int outboundSize) { + this.tags = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlProtocols = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlStubs = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.edges = new ArrayList(inboundSize + outboundSize); + } + } + + public Subgraph edges( + final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth, + final Map alllinks, + final Map images, + final Set inboundLinks, + final Set outboundLinks + ) { + boolean allAttr = this.isEmpty(); + Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); + addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, true, inboundLinks); + addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, false, outboundLinks); + return subgraph; + } + + private void addEdges( + final Subgraph subgraph, + final DigestURI source, final ResponseHeader 
responseHeader, String[] collections, int clickdepth, + final boolean allAttr, final Map alllinks, final Map images, + final boolean inbound, final Set links) { + for (final DigestURI target_url: links) { + final Properties p = alllinks.get(target_url); + if (p == null) continue; + final String name = p.getProperty("name", ""); // the name attribute + final String text = p.getProperty("text", ""); // the text between the tag + final String rel = p.getProperty("rel", ""); // the rel-attribute + int ioidx = inbound ? 0 : 1; + + // index organization + StringBuilder idi = new StringBuilder(8); + idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase()); + while (idi.length() < 8) idi.insert(0, '0'); + String source_id = ASCII.String(source.hash()); + String target_id = ASCII.String(target_url.hash()); + StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi); + SolrInputDocument edge = new SolrInputDocument(); + add(edge, WebgraphSchema.id, id.toString()); + if (allAttr || contains(WebgraphSchema.load_date_dt)) { + Date loadDate = new Date(); + Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; + add(edge, WebgraphSchema.load_date_dt, loadDate); + } + if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? 
new Date() : responseHeader.lastModified()); + add(edge, WebgraphSchema.collection_sxt, collections); + // add the source attributes + add(edge, WebgraphSchema.source_id_s, source_id); + final String source_url_string = source.toNormalform(false); + int pr_source = source_url_string.indexOf("://",0); + if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source)); + if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3)); + Map source_searchpart = source.getSearchpartMap(); + if (source_searchpart == null) { + if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0); + } else { + if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size()); + if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()])); + if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()])); + } + if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length()); + String source_host = null; + if ((source_host = source.getHost()) != null) { + String dnc = Domains.getDNC(source_host); + String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1); + int pp = subdomOrga.lastIndexOf('.'); + String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); + String orga = (pp < 0) ? 
subdomOrga : subdomOrga.substring(pp + 1); + if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host); + if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash()); + if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc); + if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga); + if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc); + if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom); + } + if (allAttr || contains(WebgraphSchema.source_file_ext_s)) add(edge, WebgraphSchema.source_file_ext_s, source.getFileExtension()); + if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath()); + if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) { + String[] paths = source.getPaths(); + add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); + add(edge, WebgraphSchema.source_path_folders_sxt, paths); + } + add(edge, WebgraphSchema.source_clickdepth_i, clickdepth); + + // add the source attributes about the target + if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); + if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); + if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); + if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? 
rel : "")); + if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : ""); + if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length()); + if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); + String tag = " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""; + subgraph.tags[ioidx].add(tag); + if (allAttr || contains(WebgraphSchema.target_tag_s)) add(edge, WebgraphSchema.target_tag_s, tag); + ImageEntry ientry = images.get(target_url); + String alttext = ientry == null ? "" : ientry.alt(); + if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext); + if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); + if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? 
CommonPattern.SPACE.split(alttext).length : 0); + + // add the target attributes + add(edge, WebgraphSchema.target_id_s, target_id); + final String target_url_string = target_url.toNormalform(false); + int pr_target = target_url_string.indexOf("://",0); + subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); + if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); + subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); + if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); + Map target_searchpart = target_url.getSearchpartMap(); + if (target_searchpart == null) { + if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0); + } else { + if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size()); + if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()])); + if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()])); + } + if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length()); + String target_host = null; + if ((target_host = target_url.getHost()) != null) { + String dnc = Domains.getDNC(target_host); + String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1); + int pp = subdomOrga.lastIndexOf('.'); + String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); + String orga = (pp < 0) ? 
subdomOrga : subdomOrga.substring(pp + 1); + if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host); + if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash()); + if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc); + if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga); + if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc); + if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom); + } + if (allAttr || contains(WebgraphSchema.target_file_ext_s)) add(edge, WebgraphSchema.target_file_ext_s, target_url.getFileExtension()); + if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath()); + if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) { + String[] paths = target_url.getPaths(); + add(edge, WebgraphSchema.target_path_folders_count_i, paths.length); + add(edge, WebgraphSchema.target_path_folders_sxt, paths); + } + add(edge, WebgraphSchema.target_clickdepth_i, clickdepth); + + // add the edge to the subgraph + subgraph.edges.add(edge); + } + } + + /** + * encode a string containing attributes from anchor rel properties binary: + * bit 0: "me" contained in rel + * bit 1: "nofollow" contained in rel + * @param rel + * @return binary encoded information about rel + */ + private static int relEval(final String rels) { + int i = 0; + final String s0 = rels.toLowerCase().trim(); + if ("me".equals(s0)) i += 1; + if ("nofollow".equals(s0)) i += 2; + return i; + } /** * save configuration to file and update enum SolrFields diff --git 
a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 74a21ddf9..4202f2e08 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -30,16 +30,19 @@ import org.apache.solr.common.SolrInputDocument; public enum WebgraphSchema implements SchemaDeclaration { + // index organisation id(SolrType.string, true, true, false, "primary key of document, a combination of (28 characters)"), + last_modified(SolrType.date, true, true, false, "last-modified from http header"), + load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"), collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"), + // source information source_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (source)"), - source_url_s(SolrType.string, true, true, false, "the url of the document (source)"), + source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"), + source_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (source)"), source_file_ext_s(SolrType.string, true, true, false, "the file name extension (source)"), - source_tag_s(SolrType.string, true, true, false, "normalized (absolute URLs), as - tag with anchor text and nofollow (source)"), source_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (source)"), - source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"), - source_path_s(SolrType.string, true, true, true, "path of the url (source)"), + source_path_s(SolrType.string, true, true, false, "path of the url (source)"), source_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (source)"), source_path_folders_sxt(SolrType.string, true, true, true, "all 
path elements in the url (source)"), source_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (source)"), @@ -47,12 +50,14 @@ public enum WebgraphSchema implements SchemaDeclaration { source_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url (source)"), source_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"), - source_host_s(SolrType.string, true, true, false, "host of the url"), + source_host_s(SolrType.string, true, true, false, "host of the url (source)"), + source_host_id_s(SolrType.string, true, true, false, "id of the host (source)"), source_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)"), source_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"), source_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' 
(source)"), source_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (source)"), + // information in the source about the target target_linktext_t(SolrType.text_general, true, true, false, "the text content of the a-tag (in source, but pointing to a target)"), target_linktext_charcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"), target_linktext_wordcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"), @@ -63,14 +68,15 @@ public enum WebgraphSchema implements SchemaDeclaration { target_rel_s(SolrType.string, true, true, false, "the rel property of the a-tag (in source, but pointing to a target)"), target_relflags_i(SolrType.num_integer, true, true, false, "the rel property of the a-tag, coded binary (in source, but pointing to a target)"), + // target information target_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (target)"), - target_url_s(SolrType.string, true, true, false, "the url of the document (target)"), + target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"), + target_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (target)"), target_file_ext_s(SolrType.string, true, true, false, "the file name extension (target)"), target_tag_s(SolrType.string, true, true, false, "normalized (absolute URLs), as - tag with anchor text and nofollow (target)"), target_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (target)"), - target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"), - target_path_s(SolrType.string, true, true, true, "path of the url (target)"), - target_path_folders_count_i(SolrType.num_integer, true, true, true, "count of all path elements 
in the url (target)"), + target_path_s(SolrType.string, true, true, false, "path of the url (target)"), + target_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (target)"), target_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (target)"), target_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (target)"), target_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url (target)"), @@ -78,11 +84,14 @@ public enum WebgraphSchema implements SchemaDeclaration { target_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"), target_host_s(SolrType.string, true, true, false, "host of the url (target)"), + target_host_id_s(SolrType.string, true, true, false, "id of the host (target)"), target_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)"), target_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain (target)"), target_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' 
(target)"), - target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)"); - + target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)"), + + target_inbound_b(SolrType.bool, true, true, false, "flag shows if the target host is equal to the source host"); + public final static String CORE_NAME = "webgraph"; public final static String VOCABULARY_PREFIX = "vocabulary_"; diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index 740290846..351455a12 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -38,7 +38,6 @@ import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; @@ -164,20 +163,20 @@ public class MediaSnippet implements Comparable, Comparator computeMediaSnippets(final DigestURI source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) { if (document == null) return new ArrayList(); - Map media = null; + Map media = null; if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks(); else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks(); else if (mediatype == ContentDomain.APP) media = document.getApplinks(); if (media == null) return null; - final Iterator> i = media.entrySet().iterator(); - Map.Entry entry; + final Iterator> i = media.entrySet().iterator(); + Map.Entry entry; DigestURI url; String desc; final List result = new ArrayList(); while (i.hasNext()) { entry = i.next(); - url = DigestURI.toDigestURI(entry.getKey()); + url 
= entry.getKey(); desc = entry.getValue(); if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() + @@ -202,7 +201,7 @@ public class MediaSnippet implements Comparable, Comparator result = new ArrayList(); while (i.hasNext()) { ientry = i.next(); - url = DigestURI.toDigestURI(ientry.url()); + url = ientry.url(); final String u = url.toString(); if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue; diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java index c0cec40e4..17fdba890 100644 --- a/source/net/yacy/server/http/HTTPDProxyHandler.java +++ b/source/net/yacy/server/http/HTTPDProxyHandler.java @@ -309,7 +309,7 @@ public final class HTTPDProxyHandler { DigestURI url = null; try { - url = DigestURI.toDigestURI(HeaderFramework.getRequestURL(conProp)); + url = HeaderFramework.getRequestURL(conProp); if (log.isFine()) log.logFine(reqID +" GET "+ url); if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); @@ -392,7 +392,7 @@ public final class HTTPDProxyHandler { final Request request = new Request( null, url, - requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(), + requestHeader.referer() == null ? null : requestHeader.referer().hash(), "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), @@ -528,7 +528,7 @@ public final class HTTPDProxyHandler { final Request request = new Request( null, url, - requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(), + requestHeader.referer() == null ? null : requestHeader.referer().hash(), "", responseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(),
    DatabaseEntriesDatabaseEntries
    Pages (URLs)Documents
    solr search api
    #[urlpublictextSize]#
    RWIs (Words)Webgraph Edges
    solr search api
    #[webgraphSize]#
    RWIs
    (P2P Chunks)
    #[rwipublictextSize]#