diff --git a/.classpath b/.classpath index 0b3ea7ce7..f832ab578 100644 --- a/.classpath +++ b/.classpath @@ -1,7 +1,7 @@ - + @@ -10,7 +10,7 @@ - + diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index bc045020c..ac1f9f532 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -214,27 +214,6 @@ inboundlinks_protocol_sxt ## internal links, the url only without the protocol inboundlinks_urlstub_txt -## internal links, the name property of the a-tag -#inboundlinks_name_txt - -## internal links, the rel property of the a-tag -#inboundlinks_rel_sxt - -## internal links, the rel property of the a-tag, coded binary -#inboundlinks_relflags_val - -## internal links, the text content of the a-tag -#inboundlinks_text_txt - -## internal links, the length of the a-tag as number of characters -#inboundlinks_text_chars_val - -## internal links, the length of the a-tag as number of words -#inboundlinks_text_words_val - -##if the link is an image link, this contains the alt tag if the image is also liked as img link -#inboundlinks_alttag_txt - ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow #outboundlinks_tag_txt @@ -244,27 +223,6 @@ outboundlinks_protocol_sxt ## external links, the url only without the protocol outboundlinks_urlstub_txt -## external links, the name property of the a-tag -#outboundlinks_name_txt - -## external links, the rel property of the a-tag -#outboundlinks_rel_sxt - -## external links, the rel property of the a-tag, coded binary -#outboundlinks_relflags_val - -## external links, the text content of the a-tag -#outboundlinks_text_txt - -## external links, the length of the a-tag as number of characters -#outboundlinks_text_chars_val - -## external links, the length of the a-tag as number of words -#outboundlinks_text_words_val - -##if the link is an image link, this contains the alt tag if the image is also liked as img link -#outboundlinks_alttag_txt - ## all image tags, encoded as tag inclusive alt- and title property #images_tag_txt diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index f48931681..7e69a7c8a 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -15,6 +15,12 @@ ## primary key of document, a combination of (28 characters) id +## last-modified from http header, date (mandatory field) +last_modified + +## time when resource was loaded +load_date_dt + ## tags that are attached to crawls/index generation to separate the search result into user-defined subsets collection_sxt @@ -26,21 +32,18 @@ collection_sxt ## primary key of document, the URL hash (source) source_id_s -## the url of the document (source) -#source_url_s +## the protocol of the url (source) +#source_protocol_s + +## the url without the protocol (source) +#source_urlstub_s ## the file name extension (source) #source_file_ext_s -## normalized (absolute URLs), as - tag with anchor text and nofollow (source) -#source_tag_s - ## number of all characters in the url (source) #source_chars_i -## the protocol of the url (source) -#source_protocol_s - ## path of the url (source) #source_path_s @@ -62,9 +65,12 @@ source_id_s ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source) #source_clickdepth_i -## host of the url +## host of the url (source) #source_host_s +## id of the host (source) +source_host_id_s + ## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source) #source_host_dnc_s @@ -117,8 +123,11 @@ target_name_t ## primary key of document, the URL hash (target) target_id_s -## the url of the document (target) -target_url_s +## the protocol of the url (target) +target_protocol_s + +## the url without the protocol (target) +target_urlstub_s ## the file name extension (target) target_file_ext_s @@ -129,9 +138,6 @@ target_file_ext_s ## number of all characters in the url (target) #target_chars_i -## the protocol of the url (target) -target_protocol_s - ## path of the url (target) #target_path_s @@ -156,6 +162,9 @@ target_path_folders_sxt ## host of the url (target) #target_host_s +## id of the host (target) +target_host_id_s + ## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target) #target_host_dnc_s @@ -168,5 +177,5 @@ target_path_folders_sxt ## the remaining part of the host without organizationdnc (target) #target_host_subdomain_s - - +## flag shows if the target host is equal to the source host +target_inbound_b diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 32af59772..c0d665a3e 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -121,7 +121,8 @@ public class ConfigHeuristics_p { } try { sb.index.fulltext().getDefaultConfiguration().commit(); - } catch (IOException ex) {} + } catch (IOException e) { + } } } diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index 1482991b0..b08cdb19f 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -143,13 +143,12 @@ public class CrawlStartScanner_p if ( post.containsKey("crawl") ) { // make a pk/url mapping final Iterator> se = Scanner.scancacheEntries(); - final Map pkmap = - new TreeMap(Base64Order.enhancedCoder); + final Map pkmap = new TreeMap(Base64Order.enhancedCoder); while (se.hasNext()) { final Scanner.Service u = se.next().getKey(); DigestURI uu; try { - uu = DigestURI.toDigestURI(u.url()); + uu = u.url(); pkmap.put(uu.hash(), uu); } catch ( final MalformedURLException e ) { Log.logException(e); @@ -197,15 +196,14 @@ public class CrawlStartScanner_p String urlString; DigestURI u; try { - final Iterator> se = - Scanner.scancacheEntries(); + final Iterator> se = Scanner.scancacheEntries(); Map.Entry host; while ( se.hasNext() ) { host = se.next(); try { - u = DigestURI.toDigestURI(host.getKey().url()); + u = host.getKey().url(); urlString = u.toNormalform(true); - if ( host.getValue() == Access.granted + if (host.getValue() == Access.granted && Scanner.inIndex(apiCommentCache, urlString) == null ) { String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 229eaddd1..80769580c 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -21,7 +21,7 @@ #%env/templates/submenuCrawlMonitor.template%#

Crawler

-
+
Queues @@ -74,20 +74,24 @@
-
+
Index Size - - + + - + - + + + + + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index fc76cf17b..003958f8b 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -36,7 +36,6 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.SpaceExceededException; @@ -73,6 +72,7 @@ public class Crawler_p { final serverObjects prop = new serverObjects(); prop.put("rejected", 0); prop.put("urlpublictextSize", 0); + prop.put("webgraphSize", 0); prop.put("rwipublictextSize", 0); prop.put("list", "0"); prop.put("loaderSize", 0); @@ -277,8 +277,8 @@ public class Crawler_p { try { scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); // get links and generate filter - for (MultiProtocolURI u: scraper.getAnchors().keySet()) { - newRootURLs.add(DigestURI.toDigestURI(u)); + for (DigestURI u: scraper.getAnchors().keySet()) { + newRootURLs.add(u); } } catch (IOException e) { Log.logException(e); @@ -475,7 +475,7 @@ public class Crawler_p { writer.close(); // get links and generate filter - final Map hyperlinks = scraper.getAnchors(); + final Map hyperlinks = scraper.getAnchors(); if (newcrawlingdepth > 0) { if (fullDomain) { newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks.keySet()); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 3f9912fbf..1097255e0 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -79,7 +79,7 @@ public class HostBrowser { // set default values prop.put("path", ""); prop.put("result", ""); - prop.putNum("ucount", fulltext.size()); + prop.putNum("ucount", fulltext.collectionSize()); prop.put("hosts", 0); prop.put("files", 0); prop.put("admin", 0); @@ -117,7 +117,7 @@ public class HostBrowser { String load = post.get("load", ""); boolean wait = false; - if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(pathURI.hash())) { + if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) { // in case that the url does not exist and loading is wanted turn this request into a loading request load = path; wait = true; @@ -136,7 +136,7 @@ public class HostBrowser { )); prop.put("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); if (wait) for (int i = 0; i < 30; i++) { - if (sb.index.exists(url.hash())) break; + if (sb.index.exists(ASCII.String(url.hash()))) break; try {Thread.sleep(100);} catch (InterruptedException e) {} } } catch (MalformedURLException e) { @@ -480,7 +480,7 @@ public class HostBrowser { } // insert constants - prop.putNum("ucount", fulltext.size()); + prop.putNum("ucount", fulltext.collectionSize()); // return rewrite properties return prop; } diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index d81fae130..8b16efa76 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -280,7 +280,7 @@ public class IndexControlRWIs_p { Reference iEntry; while (urlIter.hasNext()) { iEntry = urlIter.next(); - if (!segment.fulltext().exists(iEntry.urlhash())) { + if (!segment.fulltext().exists(ASCII.String(iEntry.urlhash()))) { try { unknownURLEntries.put(iEntry.urlhash()); } catch (final SpaceExceededException e) { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 2fd724c99..6e2b1ec87 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -66,7 +66,7 @@ public class IndexControlURLs_p { prop.put("urlstring", ""); prop.put("urlhash", ""); prop.put("result", ""); - prop.putNum("ucount", segment.fulltext().size()); + prop.putNum("ucount", segment.fulltext().collectionSize()); prop.put("otherHosts", ""); prop.put("genUrlProfile", 0); prop.put("statistics", 1); @@ -312,7 +312,7 @@ public class IndexControlURLs_p { } // insert constants - prop.putNum("ucount", segment.fulltext().size()); + prop.putNum("ucount", segment.fulltext().collectionSize()); // return rewrite properties return prop; } diff --git a/htroot/IndexSchema_p.html b/htroot/IndexSchema_p.html index 81283a04c..64d701fcc 100644 --- a/htroot/IndexSchema_p.html +++ b/htroot/IndexSchema_p.html @@ -16,12 +16,13 @@

If you use a custom Solr schema you may enter a different field name in the column 'Custom Solr Field Name' of the YaCy default attribute name

- Select a Core: + Select a core: +    ... the core can be searched at /solr/select?core=#[core]#&q=*:*&start=0&rows=3
diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java index 03e49f9ca..028023d06 100644 --- a/htroot/IndexShare_p.java +++ b/htroot/IndexShare_p.java @@ -51,7 +51,7 @@ public class IndexShare_p { prop.put("dtable", ""); prop.put("rtable", ""); prop.putNum("wcount", indexSegment.RWICount()); - prop.putNum("ucount", indexSegment.fulltext().size()); + prop.putNum("ucount", indexSegment.fulltext().collectionSize()); return prop; // be save } @@ -64,7 +64,7 @@ public class IndexShare_p { // insert constants prop.putNum("wcount", indexSegment.RWICount()); - prop.putNum("ucount", indexSegment.fulltext().size()); + prop.putNum("ucount", indexSegment.fulltext().collectionSize()); // return rewrite properties return prop; diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 4937965ac..bddc8a072 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Hit; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; @@ -272,7 +273,7 @@ public class Load_RSS_p { final RSSMessage message = feed.getMessage(entry.getValue().substring(5)); final DigestURI messageurl = new DigestURI(message.getLink()); if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop; - if (sb.urlExists(messageurl.hash()) != null) continue loop; + if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop; sb.addToIndex(messageurl, null, null); RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); } catch (final IOException e) { @@ -317,7 +318,7 @@ public class Load_RSS_p { author = item.getAuthor(); if (author == null) author = item.getCopyright(); pubDate = item.getPubDate(); - prop.put("showitems_item_" + i + "_state", sb.urlExists(messageurl.hash()) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); + prop.put("showitems_item_" + i + "_state", sb.urlExists(ASCII.String(messageurl.hash())) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); prop.put("showitems_item_" + i + "_state_count", i); prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid()); prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); diff --git a/htroot/ServerScannerList.java b/htroot/ServerScannerList.java index 4bd7b531c..4809fe914 100644 --- a/htroot/ServerScannerList.java +++ b/htroot/ServerScannerList.java @@ -63,7 +63,7 @@ public class ServerScannerList { while (se.hasNext()) { host = se.next(); try { - u = DigestURI.toDigestURI(host.getKey().url()); + u = host.getKey().url(); urlString = u.toNormalform(true); prop.put("servertable_list_" + i + "_edit", edit ? 1 : 0); prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash())); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 244a9fddb..da65a4a9d 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -308,7 +308,7 @@ public class ViewFile { i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors()); dark = (i % 2 == 0); - final Map ts = document.getImages(); + final Map ts = document.getImages(); final Iterator tsi = ts.values().iterator(); ImageEntry entry; while (tsi.hasNext()) { @@ -432,12 +432,12 @@ public class ViewFile { final serverObjects prop, final String[] wordArray, int c, - final Map media, + final Map media, final String type, boolean dark, - final Map alllinks) { + final Map alllinks) { int i = 0; - for (final Map.Entry entry : media.entrySet()) { + for (final Map.Entry entry : media.entrySet()) { final Properties p = alllinks.get(entry.getKey()); final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index b5c073c3c..566e26b5e 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -127,11 +127,11 @@ public class getpageinfo { prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next()); // get links and put them into a semicolon-separated list - final Set uris = scraper.getAnchors().keySet(); + final Set uris = scraper.getAnchors().keySet(); final StringBuilder links = new StringBuilder(uris.size() * 80); final StringBuilder filter = new StringBuilder(uris.size() * 40); count = 0; - for (final MultiProtocolURI uri: uris) { + for (final DigestURI uri: uris) { if (uri == null) continue; links.append(';').append(uri.toNormalform(true)); filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index d3205a4d9..7946a2b70 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -127,11 +127,11 @@ public class getpageinfo_p { prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next()); // get links and put them into a semicolon-separated list - final Set uris = scraper.getAnchors().keySet(); + final Set uris = scraper.getAnchors().keySet(); final StringBuilder links = new StringBuilder(uris.size() * 80); final StringBuilder filter = new StringBuilder(uris.size() * 40); count = 0; - for (final MultiProtocolURI uri: uris) { + for (final DigestURI uri: uris) { if (uri == null) continue; links.append(';').append(uri.toNormalform(true)); filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index cf9529404..07eb120f4 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -76,7 +76,8 @@ public class status_p { prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER)); // index size - prop.putNum("urlpublictextSize", segment.fulltext().size()); + prop.putNum("urlpublictextSize", segment.fulltext().collectionSize()); + prop.putNum("webgraphSize", segment.fulltext().webgraphSize()); prop.putNum("rwipublictextSize", segment.RWICount()); // loader queue diff --git a/htroot/api/status_p.xml b/htroot/api/status_p.xml index 25c62234b..7eec1d761 100644 --- a/htroot/api/status_p.xml +++ b/htroot/api/status_p.xml @@ -21,6 +21,7 @@ #[urlpublictextSize]# + #[webgraphSize]# #[rwipublictextSize]# diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index 53f7cbc95..5d834b87e 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -30,7 +30,6 @@ import java.util.Map; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.RequestHeader; @@ -111,9 +110,9 @@ public class webstructure { prop.put("references_documents_0_urle", url == null ? 0 : 1); if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true)); int d = 0; - Iterator i = scraper.inboundLinks().iterator(); + Iterator i = scraper.inboundLinks().iterator(); while (i.hasNext()) { - DigestURI refurl = DigestURI.toDigestURI(i.next()); + DigestURI refurl = i.next(); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); @@ -122,7 +121,7 @@ public class webstructure { } i = scraper.outboundLinks().iterator(); while (i.hasNext()) { - DigestURI refurl = DigestURI.toDigestURI(i.next()); + DigestURI refurl = i.next(); byte[] refhash = refurl.hash(); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.put("references_documents_0_anchors_" + d + "_hash", refhash); diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index ae5020f34..3b6c4b6c4 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -11,8 +11,9 @@
  • File Search
  • Host Browser
  • -
  • Embedded Solr API
  • -
  • Embedded GSA API
  • +
  • Solr Default Core
  • +
  • Solr Webgraph Core
  • +
  • Google Search API
  • Compare Search
  • URL Viewer
  • diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 78cb84ac5..1410a647e 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -169,7 +169,7 @@ public class searchresult { } // get the embedded connector - EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector(); + EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultEmbeddedConnector(); if (connector == null) return null; // do the solr request diff --git a/htroot/js/Crawler.js b/htroot/js/Crawler.js index 208ba6b1e..a2890c26e 100644 --- a/htroot/js/Crawler.js +++ b/htroot/js/Crawler.js @@ -90,7 +90,9 @@ function handleStatus(){ dbsize=getFirstChild(statusTag, "dbsize"); urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext")); rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext")); + webgraphSize=getValue(getFirstChild(dbsize, "webgraph")); document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize; + document.getElementById("webgraphsize").firstChild.nodeValue=webgraphSize; document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize; loaderqueue=getFirstChild(statusTag, "loaderqueue"); diff --git a/htroot/solr/select.java b/htroot/solr/select.java index 1ad96f2da..96ba19905 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -42,6 +42,8 @@ import net.yacy.search.SwitchboardConstants; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.QueryModifier; import net.yacy.search.query.SearchEvent; +import net.yacy.search.schema.CollectionSchema; +import net.yacy.search.schema.WebgraphSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -181,7 +183,8 @@ public class select { } // get the embedded connector - EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector(); + boolean defaultConnector = post == null || post.get("core", CollectionSchema.CORE_NAME).equals(CollectionSchema.CORE_NAME); + EmbeddedSolrConnector connector = defaultConnector ? sb.index.fulltext().getDefaultEmbeddedConnector() : sb.index.fulltext().getEmbeddedConnector(WebgraphSchema.CORE_NAME); if (connector == null) return null; // do the solr request, generate facets if we use a special YaCy format diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index 88cd9284e..594d903b1 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -109,7 +109,7 @@ public final class query { if (obj.equals("lurlcount")) { // return the number of all available l-url's - prop.put("response", sb.index.fulltext().size()); + prop.put("response", sb.index.fulltext().collectionSize()); return prop; } diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index e54f15ee4..14186eb7a 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -201,7 +201,7 @@ public final class transferRWI { // check if we need to ask for the corresponding URL if (!knownURL.has(urlHash) && !unknownURL.has(urlHash)) try { - if (sb.index.fulltext().exists(urlHash)) { + if (sb.index.fulltext().exists(ASCII.String(urlHash))) { knownURL.put(urlHash); } else { unknownURL.put(urlHash); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index acfa99138..bcaf3da08 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -139,7 +139,7 @@ public final class transferURL { } // doublecheck - if (sb.index.exists(lEntry.hash())) { + if (sb.index.exists(ASCII.String(lEntry.hash()))) { if (Network.log.isFine()) Network.log.logFine("transferURL: double URL '" + lEntry.url() + "' from peer " + otherPeerName); lEntry = null; doublecheck++; diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index c095bc0d8..b925bf0c0 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -175,7 +175,7 @@ public class OpenSearchConnector { if (sb == null) { return false; } - final EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector(); + final EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultEmbeddedConnector(); // check if needed Solr fields are available (selected) if (connector == null) { Log.logSevere("OpenSearchConnector.Discover", "Error on connecting to embedded Solr index"); diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 72a87c21a..86b113ec4 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -21,6 +21,7 @@ package net.yacy.cora.federate.solr.connector; import java.io.IOException; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -44,6 +45,7 @@ import org.apache.solr.client.solrj.response.FacetField.Count; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.params.ModifiableSolrParams; @@ -285,4 +287,11 @@ public abstract class AbstractSolrConnector implements SolrConnector { throw new IOException(e.getMessage(), e); } } + + @Override + public void add(final Collection solrdocs) throws IOException, SolrException { + for (SolrInputDocument solrdoc: solrdocs) { + add(solrdoc); + } + } } diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index ffa6f19ae..98c68331a 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -58,6 +58,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo private final SearchHandler requestHandler; private final EmbeddedInstance instance; + private final String coreName; private SolrCore core; public EmbeddedSolrConnector(EmbeddedInstance instance) { @@ -68,6 +69,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo this.requestHandler.init(new NamedList()); this.requestHandler.inform(this.core); super.init(this.instance.getDefaultServer()); + this.coreName = ((EmbeddedSolrServer) this.server).getCoreContainer().getDefaultCoreName(); } public EmbeddedSolrConnector(EmbeddedInstance instance, String coreName) { @@ -78,6 +80,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo this.requestHandler.init(new NamedList()); this.requestHandler.inform(this.core); super.init(this.instance.getServer(coreName)); + this.coreName = coreName; } public SolrInstance getInstance() { @@ -104,9 +107,8 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo Thread.currentThread().setName("solr query: size"); EmbeddedSolrServer ess = (EmbeddedSolrServer) this.server; CoreContainer coreContainer = ess.getCoreContainer(); - String coreName = coreContainer.getDefaultCoreName(); - SolrCore core = coreContainer.getCore(coreName); - if (core == null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No such core: " + coreName); + SolrCore core = coreContainer.getCore(this.coreName); + if (core == null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No such core: " + this.coreName); try { SolrParams params = AbstractSolrConnector.catchSuccessQuery; diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 38a26431c..81a01afe4 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -21,6 +21,7 @@ package net.yacy.cora.federate.solr.connector; import java.io.IOException; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.concurrent.BlockingQueue; @@ -74,8 +75,7 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * delete entries from solr according the given solr query string - * @param id the url hash of the entry - * @return the number of deletions + * @param querystring * @throws IOException */ public void deleteByQuery(final String querystring) throws IOException; @@ -96,6 +96,15 @@ public interface SolrConnector extends Iterable /* Iterable of document * @throws SolrException */ public void add(final SolrInputDocument solrdoc) throws IOException, SolrException; + + /** + * add a collection of solr input documents + * @param solrdocs + * @throws IOException + * @throws SolrException + */ + public void add(final Collection solrdoc) throws IOException, SolrException; + /** * get a field value from solr by given key for the id-field and a field name * @param key diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index 537ca8b1b..dd6210540 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -22,6 +22,8 @@ package net.yacy.cora.federate.solr.connector; import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; import java.util.List; import net.yacy.kelondro.logging.Log; @@ -197,4 +199,32 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen } } + @Override + public void add(final Collection solrdocs) throws IOException, SolrException { + if (this.server == null) return; + try { + for (SolrInputDocument solrdoc : solrdocs) { + if (solrdoc.containsKey("_version_")) solrdoc.setField("_version_",0L); // prevent Solr "version conflict" + } + synchronized (this.server) { + this.server.add(solrdocs, -1); + } + } catch (Throwable e) { + // catches "version conflict for": try this again and delete the document in advance + List ids = new ArrayList(); + for (SolrInputDocument solrdoc : solrdocs) ids.add((String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + try { + this.server.deleteById(ids); + } catch (SolrServerException e1) {} + try { + synchronized (this.server) { + this.server.add(solrdocs, -1); + } + } catch (Throwable ee) { + log.warn(e.getMessage() + " IDs=" + ids.toString()); + throw new IOException(ee); + } + } + } + } diff --git a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java index 46d804e0e..b7c76a076 100644 --- a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java +++ b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java @@ -21,8 +21,8 @@ package net.yacy.cora.federate.solr.instance; import java.util.Collection; -import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.federate.solr.connector.CachedSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; @@ -36,12 +36,16 @@ public class InstanceMirror { private ShardInstance solr1; private CachedSolrConnector defaultConnector; private Map connectorCache; + private EmbeddedSolrConnector defaultEmbeddedConnector; + private Map embeddedCache; public InstanceMirror() { this.solr0 = null; this.solr1 = null; this.defaultConnector = null; - this.connectorCache = new HashMap(); + this.connectorCache = new ConcurrentHashMap(); + this.defaultEmbeddedConnector = null; + this.embeddedCache = new ConcurrentHashMap(); } public boolean isConnected0() { @@ -50,8 +54,10 @@ public class InstanceMirror { public void connect0(EmbeddedInstance c) { for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr0 = c; } @@ -62,8 +68,10 @@ public class InstanceMirror { public void disconnect0() { if (this.solr0 == null) return; for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr0.close(); this.solr0 = null; } @@ -74,8 +82,10 @@ public class InstanceMirror { public void connect1(ShardInstance c) { for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr1 = c; } @@ -86,8 +96,10 @@ public class InstanceMirror { public void disconnect1() { if (this.solr1 == null) return; for (SolrConnector connector: connectorCache.values()) connector.close(); - this.connectorCache.clear(); this.defaultConnector = null; + this.connectorCache.clear(); + this.defaultEmbeddedConnector = null; + this.embeddedCache.clear(); this.solr1.close(); this.solr1 = null; } @@ -108,8 +120,23 @@ public class InstanceMirror { if (this.solr1 != null) return this.solr1.getCoreNames(); return null; } + + public EmbeddedSolrConnector getDefaultEmbeddedConnector() { + if (this.defaultEmbeddedConnector != null) return this.defaultEmbeddedConnector; + this.defaultEmbeddedConnector = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0); + this.embeddedCache.put(this.getDefaultCoreName(), this.defaultEmbeddedConnector); + return this.defaultEmbeddedConnector; + } + + public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + EmbeddedSolrConnector ec = this.embeddedCache.get(corename); + if (ec != null) return ec; + ec = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0, corename); + this.embeddedCache.put(corename, ec); + return ec; + } - public SolrConnector getDefaultConnector() { + public SolrConnector getDefaultMirrorConnector() { if (this.defaultConnector != null) return this.defaultConnector; String defaultCoreName = this.getDefaultCoreName(); if (defaultCoreName == null) return null; @@ -120,7 +147,7 @@ public class InstanceMirror { return this.defaultConnector; } - public SolrConnector getConnector(String corename) { + public SolrConnector getMirrorConnector(String corename) { CachedSolrConnector msc = this.connectorCache.get(corename); if (msc != null) return msc; EmbeddedSolrConnector esc = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0, corename); diff --git a/source/net/yacy/cora/protocol/HeaderFramework.java b/source/net/yacy/cora/protocol/HeaderFramework.java index cbbf3a95d..e3453fcb2 100644 --- a/source/net/yacy/cora/protocol/HeaderFramework.java +++ b/source/net/yacy/cora/protocol/HeaderFramework.java @@ -40,10 +40,10 @@ import java.util.Vector; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.NumberTools; +import net.yacy.kelondro.data.meta.DigestURI; /** @@ -560,7 +560,7 @@ public class HeaderFramework extends TreeMap implements Map conProp) throws MalformedURLException { + public static DigestURI getRequestURL(final HashMap conProp) throws MalformedURLException { String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given @@ -574,7 +574,7 @@ public class HeaderFramework extends TreeMap implements Map hyperlinks) { + public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map hyperlinks) { new Thread() { @Override public void run() { @@ -201,12 +201,12 @@ public final class CrawlStacker { }.start(); } - private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map hyperlinks, final boolean replace) { - for (final Map.Entry e: hyperlinks.entrySet()) { + private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map hyperlinks, final boolean replace) { + for (final Map.Entry e: hyperlinks.entrySet()) { if (e.getKey() == null) continue; // delete old entry, if exists to force a re-load of the url (thats wanted here) - final DigestURI url = DigestURI.toDigestURI(e.getKey()); + final DigestURI url = e.getKey(); final byte[] urlhash = url.hash(); if (replace) { this.indexSegment.fulltext().remove(urlhash); diff --git a/source/net/yacy/crawler/data/ResultImages.java b/source/net/yacy/crawler/data/ResultImages.java index e2ba6acf4..62d76ce5d 100644 --- a/source/net/yacy/crawler/data/ResultImages.java +++ b/source/net/yacy/crawler/data/ResultImages.java @@ -61,7 +61,7 @@ public class ResultImages { if (MemoryControl.shortStatus()) clearQueues(); limitQueues(1000); - final Map images = document.getImages(); + final Map images = document.getImages(); for (final ImageEntry image: images.values()) { // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup if (image == null || image.url() == null) continue; diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 9c9d73ec1..b50a78dc7 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -28,7 +28,6 @@ import java.io.IOException; import java.util.Date; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -152,7 +151,7 @@ public final class HTTPLoader { } // normalize URL - final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); // restart crawling with new url this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); @@ -172,7 +171,7 @@ public final class HTTPLoader { } // check if the url was already indexed - final HarvestProcess dbname = this.sb.urlExists(redirectionUrl.hash()); + final HarvestProcess dbname = this.sb.urlExists(ASCII.String(redirectionUrl.hash())); if (dbname != null) { // customer request this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode); throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname.toString()); @@ -293,7 +292,7 @@ public final class HTTPLoader { } // normalizing URL - final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); // if we are already doing a shutdown we don't need to retry crawling diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index 3886fffae..265ccced9 100644 --- a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSReader; @@ -90,7 +91,7 @@ public class RSSLoader extends Thread { try { final DigestURI messageurl = new DigestURI(message.getLink()); if (indexTriggered.containsKey(messageurl.hash())) continue loop; - if (sb.urlExists(messageurl.hash()) != null) continue loop; + if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop; sb.addToIndex(messageurl, null, null); indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); loadCount++; diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index ddbe84045..9f31560e0 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -82,7 +82,7 @@ public class SitemapImporter extends Thread { // check if the url is known and needs to be recrawled Date lastMod = entry.lastmod(null); if (lastMod != null) { - final HarvestProcess dbocc = this.sb.urlExists(nexturlhash); + final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash)); if (dbocc != null && dbocc == HarvestProcess.LOADED) { // the url was already loaded. we need to check the date final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java index ac22158de..ade6df6db 100644 --- a/source/net/yacy/data/BookmarkHelper.java +++ b/source/net/yacy/data/BookmarkHelper.java @@ -48,7 +48,6 @@ import javax.xml.parsers.ParserConfigurationException; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.BookmarksDB.Tag; @@ -134,9 +133,9 @@ public class BookmarkHelper { int importCount = 0; - Map links = new HashMap(); + Map links = new HashMap(); String title; - MultiProtocolURI url; + DigestURI url; Bookmark bm; final Set tags=ListManager.string2set(tag); //this allow multiple default tags try { @@ -148,14 +147,14 @@ public class BookmarkHelper { writer.close(); links = scraper.getAnchors(); } catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());} - for (final Entry link: links.entrySet()) { + for (final Entry link: links.entrySet()) { url = link.getKey(); title = link.getValue().getProperty("name", ""); Log.logInfo("BOOKMARKS", "links.get(url)"); if ("".equals(title)) {//cannot be displayed title = url.toString(); } - bm = db.new Bookmark(DigestURI.toDigestURI(url)); + bm = db.new Bookmark(url); bm.setProperty(Bookmark.BOOKMARK_TITLE, title); bm.setTags(tags); bm.setPublic(importPublic); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 39e184024..ddc1279ce 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -50,6 +50,7 @@ import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.logging.Log; @@ -113,7 +114,7 @@ public final class Condenser { // add the URL components to the word list insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); - Map.Entry entry; + Map.Entry entry; if (indexText) { createCondensement(document.getTextString(), meaningLib, doAutotagging); // the phrase counter: @@ -163,7 +164,7 @@ public final class Condenser { if (indexMedia) { // add anchor descriptions: here, we also add the url components // audio - Iterator> i = document.getAudiolinks().entrySet().iterator(); + Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index ca7a7df43..70fea62b6 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -68,7 +68,7 @@ import net.yacy.kelondro.util.FileUtils; public class Document { - private final DigestURI source; // the source url + private final DigestURI source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field @@ -78,13 +78,14 @@ public class Document { private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - private final Map anchors; // all links embedded as clickeable entities (anchor tags) - private final Map rss; // all embedded rss feeds - private final Map images; // all visible pictures in document + private final Map anchors; // all links embedded as clickeable entities (anchor tags) + private final Map rss; // all embedded rss feeds + private final Map images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - private Map hyperlinks, audiolinks, videolinks, applinks, inboundlinks, outboundlinks; + private Map audiolinks, videolinks, applinks, hyperlinks; + private Map inboundlinks, outboundlinks; private Map emaillinks; private MultiProtocolURI favicon; private boolean resorted; @@ -103,9 +104,9 @@ public class Document { final String[] sections, final String abstrct, final double lon, final double lat, final Object text, - final Map anchors, - final Map rss, - final Map images, + final Map anchors, + final Map rss, + final Map images, final boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; @@ -120,9 +121,9 @@ public class Document { this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); this.lon = lon; this.lat = lat; - this.anchors = (anchors == null) ? new HashMap(0) : anchors; - this.rss = (rss == null) ? new HashMap(0) : rss; - this.images = (images == null) ? new HashMap() : images; + this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.rss = (rss == null) ? new HashMap(0) : rss; + this.images = (images == null) ? new HashMap() : images; this.publisher = publisher; this.hyperlinks = null; this.audiolinks = null; @@ -397,13 +398,13 @@ dc_rights return this.keywords; } - public Map getAnchors() { + public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.anchors; } - public Map getRSS() { + public Map getRSS() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.rss; @@ -412,30 +413,30 @@ dc_rights // the next three methods provide a calculated view on the getAnchors/getImages: - public Map getHyperlinks() { + public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!this.resorted) resortLinks(); return this.hyperlinks; } - public Map getAudiolinks() { + public Map getAudiolinks() { if (!this.resorted) resortLinks(); return this.audiolinks; } - public Map getVideolinks() { + public Map getVideolinks() { if (!this.resorted) resortLinks(); return this.videolinks; } - public Map getImages() { + public Map getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!this.resorted) resortLinks(); return this.images; } - public Map getApplinks() { + public Map getApplinks() { if (!this.resorted) resortLinks(); return this.applinks; } @@ -459,23 +460,23 @@ dc_rights synchronized (this) { if (this.resorted) return; // extract hyperlinks, medialinks and emaillinks from anchorlinks - MultiProtocolURI url; + DigestURI url; String u; int extpos, qpos; String ext = null; final String thishost = this.source.getHost(); - this.inboundlinks = new HashMap(); - this.outboundlinks = new HashMap(); - this.hyperlinks = new HashMap(); - this.videolinks = new HashMap(); - this.audiolinks = new HashMap(); - this.applinks = new HashMap(); + this.inboundlinks = new HashMap(); + this.outboundlinks = new HashMap(); + this.hyperlinks = new HashMap(); + this.videolinks = new HashMap(); + this.audiolinks = new HashMap(); + this.applinks = new HashMap(); this.emaillinks = new HashMap(); - final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - for (final Map.Entry entry: collectedImages.entrySet()) { + final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + for (final Map.Entry entry: collectedImages.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } - for (final Map.Entry entry: this.anchors.entrySet()) { + for (final Map.Entry entry: this.anchors.entrySet()) { url = entry.getKey(); if (url == null) continue; final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0; @@ -585,23 +586,23 @@ dc_rights return v; } - private static Map allReflinks(final Collection links) { + private static Map allReflinks(final Collection links) { // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url - final Map v = new HashMap(); + final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; - MultiProtocolURI url = null; + DigestURI url = null; String u; int pos; loop: while (i.hasNext()) try { o = i.next(); - if (o instanceof MultiProtocolURI) - url = (MultiProtocolURI) o; + if (o instanceof DigestURI) + url = (DigestURI) o; else if (o instanceof String) - url = new MultiProtocolURI((String) o); + url = new DigestURI((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { @@ -615,7 +616,7 @@ dc_rights u = u.substring(pos); while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos); - url = new MultiProtocolURI(u); + url = new DigestURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -625,7 +626,7 @@ dc_rights u = "http:/" + u.substring(pos); while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) u = "http:/" + u.substring(pos); - url = new MultiProtocolURI(u); + url = new DigestURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -689,12 +690,12 @@ dc_rights return c; } - public Set inboundLinks() { + public Set inboundLinks() { if (this.inboundlinks == null) resortLinks(); return (this.inboundlinks == null) ? null : this.inboundlinks.keySet(); } - public Set outboundLinks() { + public Set outboundLinks() { if (this.outboundlinks == null) resortLinks(); return (this.outboundlinks == null) ? null : this.outboundlinks.keySet(); } @@ -764,9 +765,7 @@ dc_rights * @param docs * @return */ - public static Document mergeDocuments(final DigestURI location, - final String globalMime, final Document[] docs) - { + public static Document mergeDocuments(final DigestURI location, final String globalMime, final Document[] docs) { if (docs == null || docs.length == 0) return null; if (docs.length == 1) return docs[0]; @@ -778,9 +777,9 @@ dc_rights final StringBuilder description = new StringBuilder(80); final Collection titles = new LinkedHashSet(); final Collection sectionTitles = new LinkedHashSet(); - final Map anchors = new HashMap(); - final Map rss = new HashMap(); - final Map images = new HashMap(); + final Map anchors = new HashMap(); + final Map rss = new HashMap(); + final Map images = new HashMap(); double lon = 0.0d, lat = 0.0d; for (final Document doc: docs) { @@ -856,22 +855,22 @@ dc_rights false); } - public static Map getHyperlinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getHyperlinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { result.putAll(d.getHyperlinks()); final Object parser = d.getParserObject(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; String refresh = html.getRefreshPath(); - if (refresh != null && refresh.length() > 0)try {result.put(new MultiProtocolURI(refresh), "refresh");} catch (MalformedURLException e) {} + if (refresh != null && refresh.length() > 0)try {result.put(new DigestURI(refresh), "refresh");} catch (MalformedURLException e) {} } } return result; } - public static Map getImagelinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getImagelinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { for (final ImageEntry imageReference : d.getImages().values()) { // construct a image name which contains the document title to enhance the search process for images @@ -881,30 +880,30 @@ dc_rights return result; } - public static Map getAudiolinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getAudiolinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.audiolinks.entrySet()) { + for (Map.Entry e: d.audiolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } - public static Map getVideolinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getVideolinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.videolinks.entrySet()) { + for (Map.Entry e: d.videolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } - public static Map getApplinks(final Document[] documents) { - final Map result = new HashMap(); + public static Map getApplinks(final Document[] documents) { + final Map result = new HashMap(); for (final Document d: documents) { - for (Map.Entry e: d.applinks.entrySet()) { + for (Map.Entry e: d.applinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 502e63da3..55fbe8a75 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -53,6 +53,7 @@ import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -121,11 +122,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // class variables: collectors for links - private final Map anchors; - private final Map rss, css; - private final Set script, frames, iframes; - private final Map embeds; // urlhash/embed relation - private final Map images; // urlhash/image relation + private final Map anchors; + private final Map rss, css; + private final Set script, frames, iframes; + private final Map embeds; // urlhash/embed relation + private final Map images; // urlhash/image relation private final Map metas; private LinkedHashSet titles; //private String headline; @@ -135,7 +136,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final CharBuffer content; private final EventListenerList htmlFilterEventListeners; private double lon, lat; - private MultiProtocolURI canonical; + private DigestURI canonical; private final int maxLinks; private int breadcrumbs; @@ -148,7 +149,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { /** * The document root {@link MultiProtocolURI} */ - private MultiProtocolURI root; + private DigestURI root; /** * evaluation scores: count appearance of specific attributes @@ -156,7 +157,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final Evaluation evaluationScores; @SuppressWarnings("unchecked") - public ContentScraper(final MultiProtocolURI root, int maxLinks) { + public ContentScraper(final DigestURI root, int maxLinks) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); @@ -164,15 +165,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.root = root; this.maxLinks = maxLinks; this.evaluationScores = new Evaluation(); - this.rss = new SizeLimitedMap(maxLinks); - this.css = new SizeLimitedMap(maxLinks); - this.anchors = new SizeLimitedMap(maxLinks); - this.images = new SizeLimitedMap(maxLinks); - this.embeds = new SizeLimitedMap(maxLinks); - this.frames = new SizeLimitedSet(maxLinks); - this.iframes = new SizeLimitedSet(maxLinks); + this.rss = new SizeLimitedMap(maxLinks); + this.css = new SizeLimitedMap(maxLinks); + this.anchors = new SizeLimitedMap(maxLinks); + this.images = new SizeLimitedMap(maxLinks); + this.embeds = new SizeLimitedMap(maxLinks); + this.frames = new SizeLimitedSet(maxLinks); + this.iframes = new SizeLimitedSet(maxLinks); this.metas = new SizeLimitedMap(maxLinks); - this.script = new SizeLimitedSet(maxLinks); + this.script = new SizeLimitedSet(maxLinks); this.titles = new LinkedHashSet(); this.headlines = new ArrayList[6]; for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList(); @@ -194,7 +195,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.content.trimToSize(); } - private void mergeAnchors(final MultiProtocolURI url, final Properties p) { + private void mergeAnchors(final DigestURI url, final Properties p) { final Properties p0 = this.anchors.get(url); if (p0 == null) { this.anchors.put(url, p); @@ -282,7 +283,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // find http links inside text s = 0; String u; - MultiProtocolURI url; + DigestURI url; while (s < b.length()) { p = find(b, dpssp, s); if (p == Integer.MAX_VALUE) break; @@ -294,7 +295,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above s = p + 6; try { - url = new MultiProtocolURI(u); + url = new DigestURI(u); mergeAnchors(url, new Properties()); continue; } catch (final MalformedURLException e) {} @@ -317,9 +318,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { return (p < 0) ? Integer.MAX_VALUE : p; } - private MultiProtocolURI absolutePath(final String relativePath) { + private DigestURI absolutePath(final String relativePath) { try { - return MultiProtocolURI.newURL(this.root, relativePath); + return DigestURI.newURL(this.root, relativePath); } catch (final Exception e) { return null; } @@ -331,7 +332,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final MultiProtocolURI url = absolutePath(src); + final DigestURI url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); @@ -343,10 +344,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.evaluationScores.match(Element.imgpath, src); } else if(tagname.equalsIgnoreCase("base")) { try { - this.root = new MultiProtocolURI(tagopts.getProperty("href", EMPTY_STRING)); + this.root = new DigestURI(tagopts.getProperty("href", EMPTY_STRING)); } catch (final MalformedURLException e) {} } else if (tagname.equalsIgnoreCase("frame")) { - final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); + final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); mergeAnchors(src, tagopts /* with property "name" */); this.frames.add(src); @@ -384,13 +385,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String href = tagopts.getProperty("href", EMPTY_STRING); if (href.length() > 0) { tagopts.put("nme", areatitle); - MultiProtocolURI url = absolutePath(href); + DigestURI url = absolutePath(href); tagopts.put("href", url.toNormalform(true)); mergeAnchors(url, tagopts); } } else if (tagname.equalsIgnoreCase("link")) { final String href = tagopts.getProperty("href", EMPTY_STRING); - final MultiProtocolURI newLink = absolutePath(href); + final DigestURI newLink = absolutePath(href); if (newLink != null) { tagopts.put("href", newLink.toNormalform(true)); @@ -420,7 +421,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String src = tagopts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { - final MultiProtocolURI url = absolutePath(src); + final DigestURI url = absolutePath(src); if (url != null) { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); @@ -434,12 +435,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if(tagname.equalsIgnoreCase("param")) { final String name = tagopts.getProperty("name", EMPTY_STRING); if (name.equalsIgnoreCase("movie")) { - MultiProtocolURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); + DigestURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); tagopts.put("value", url.toNormalform(true)); mergeAnchors(url, tagopts /* with property "name" */); } } else if (tagname.equalsIgnoreCase("iframe")) { - final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); + final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); tagopts.put("src", src.toNormalform(true)); mergeAnchors(src, tagopts /* with property "name" */); this.iframes.add(src); @@ -459,7 +460,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { final String href = tagopts.getProperty("href", EMPTY_STRING); - MultiProtocolURI url; + DigestURI url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { final String f = url.getFileName(); final int p = f.lastIndexOf('.'); @@ -552,7 +553,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } catch (IOException e) { } } - for (final Map.Entry entry: scraper.getAnchors().entrySet()) { + for (final Map.Entry entry: scraper.getAnchors().entrySet()) { mergeAnchors(entry.getKey(), entry.getValue()); } this.images.putAll(scraper.images); @@ -640,15 +641,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.li.toArray(new String[this.li.size()]); } - public MultiProtocolURI[] getFlash() { + public DigestURI[] getFlash() { String ext; - ArrayList f = new ArrayList(); - for (final MultiProtocolURI url: this.anchors.keySet()) { + ArrayList f = new ArrayList(); + for (final DigestURI url: this.anchors.keySet()) { ext = url.getFileExtension(); if (ext == null) continue; if (ext.equals("swf")) f.add(url); } - return f.toArray(new MultiProtocolURI[f.size()]); + return f.toArray(new DigestURI[f.size()]); } public boolean containsFlash() { @@ -674,36 +675,36 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - public Map getAnchors() { + public Map getAnchors() { // returns a url (String) / name (String) relation return this.anchors; } - public Map getRSS() { + public Map getRSS() { // returns a url (String) / name (String) relation return this.rss; } - public Map getCSS() { + public Map getCSS() { // returns a url (String) / name (String) relation return this.css; } - public Set getFrames() { + public Set getFrames() { // returns a url (String) / name (String) relation return this.frames; } - public Set getIFrames() { + public Set getIFrames() { // returns a url (String) / name (String) relation return this.iframes; } - public Set getScript() { + public Set getScript() { return this.script; } - public MultiProtocolURI getCanonical() { + public DigestURI getCanonical() { return this.canonical; } @@ -711,11 +712,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { * get all images * @return a map of */ - public Map getImages() { + public Map getImages() { return this.images; } - public Map getEmbeds() { + public Map getEmbeds() { return this.embeds; } @@ -970,29 +971,29 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false, maxLinks); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURI("http://localhost"),null,false, maxLinks); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); htmlFilter.close(); if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content - final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"), maxLinks); + final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost"), maxLinks); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); writer.close(); return scraper; } - public static void addAllImages(final Map a, final Map b) { - final Iterator> i = b.entrySet().iterator(); - Map.Entry ie; + public static void addAllImages(final Map a, final Map b) { + final Iterator> i = b.entrySet().iterator(); + Map.Entry ie; while (i.hasNext()) { ie = i.next(); addImage(a, ie.getValue()); } } - public static void addImage(final Map a, final ImageEntry ie) { + public static void addImage(final Map a, final ImageEntry ie) { if (a.containsKey(ie.url())) { // in case of a collision, take that image that has the better image size tags if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie); diff --git a/source/net/yacy/document/parser/html/EmbedEntry.java b/source/net/yacy/document/parser/html/EmbedEntry.java index f620f507f..cbaaffb2a 100644 --- a/source/net/yacy/document/parser/html/EmbedEntry.java +++ b/source/net/yacy/document/parser/html/EmbedEntry.java @@ -20,15 +20,15 @@ package net.yacy.document.parser.html; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.kelondro.data.meta.DigestURI; public class EmbedEntry { - private final MultiProtocolURI url; + private final DigestURI url; private final int width, height; private final String type, pluginspage; - public EmbedEntry(final MultiProtocolURI url, int width, int height, String type, String pluginspage) { + public EmbedEntry(final DigestURI url, int width, int height, String type, String pluginspage) { this.url = url; this.width = width; this.height = height; @@ -36,7 +36,7 @@ public class EmbedEntry { this.pluginspage = pluginspage; } - public MultiProtocolURI getUrl() { + public DigestURI getUrl() { return this.url; } diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java index e795fa5f5..37419fffc 100644 --- a/source/net/yacy/document/parser/html/ImageEntry.java +++ b/source/net/yacy/document/parser/html/ImageEntry.java @@ -26,16 +26,16 @@ package net.yacy.document.parser.html; import java.util.Comparator; -import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.kelondro.data.meta.DigestURI; public class ImageEntry implements Comparable, Comparator { - private final MultiProtocolURI url; + private final DigestURI url; private final String alt; private final int width, height; private final long fileSize; - public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) { + public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) { assert url != null; this.url = url; this.alt = alt; @@ -44,7 +44,7 @@ public class ImageEntry implements Comparable, Comparator languages = new HashSet(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); + final HashMap anchors = new HashMap(); + final HashMap images = new HashMap(); // add this image to the map of images final String infoString = ii.info.toString(); images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); @@ -223,7 +223,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final MultiProtocolURI location, + final DigestURI location, final InputStream sourceStream) throws Parser.Failure { BufferedImage image = null; try { @@ -238,7 +238,7 @@ public class genericImageParser extends AbstractParser implements Parser { } public static ImageInfo parseJavaImage( - final MultiProtocolURI location, + final DigestURI location, final BufferedImage image) { final ImageInfo ii = new ImageInfo(location); ii.image = image; @@ -275,12 +275,12 @@ public class genericImageParser extends AbstractParser implements Parser { } public static class ImageInfo { - public MultiProtocolURI location; + public DigestURI location; public BufferedImage image; public StringBuilder info; public int height; public int width; - public ImageInfo(final MultiProtocolURI location) { + public ImageInfo(final DigestURI location) { this.location = location; this.image = null; this.info = new StringBuilder(); diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 4917f0fdd..8c471e388 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -37,7 +37,6 @@ import java.util.Properties; import java.util.Set; import net.yacy.cora.document.Hit; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSReader; import net.yacy.document.AbstractParser; @@ -75,13 +74,13 @@ public class rssParser extends AbstractParser implements Parser { final List docs = new ArrayList(); DigestURI uri; Set languages; - Map anchors; + Map anchors; Document doc; for (final Hit item: feed) try { uri = new DigestURI(item.getLink()); languages = new HashSet(); languages.add(item.getLanguage()); - anchors = new HashMap(); + anchors = new HashMap(); Properties p = new Properties(); p.put("name", item.getTitle()); anchors.put(uri, p); @@ -102,7 +101,7 @@ public class rssParser extends AbstractParser implements Parser { null, anchors, null, - new HashMap(), + new HashMap(), false); docs.add(doc); } catch (MalformedURLException e) { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index b4812211a..6e9204fa6 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -33,7 +33,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -168,7 +167,7 @@ public class sevenzipParser extends AbstractParser implements Parser { Document[] theDocs; // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects - final DigestURI url = DigestURI.toDigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath)); + final DigestURI url = DigestURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index 359766e0d..238fed1f4 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -40,7 +40,6 @@ import java.util.zip.GZIPInputStream; import javax.xml.parsers.DocumentBuilderFactory; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -96,7 +95,7 @@ public class sitemapParser extends AbstractParser implements Parser { null, null, null, - new HashMap(), + new HashMap(), false); docs.add(doc); } catch (MalformedURLException e) { diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 58d80e399..2f974aa2d 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -33,7 +33,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Properties; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -79,7 +78,7 @@ public class swfParser extends AbstractParser implements Parser { final String[] sections = null; final String abstrct = null; //TreeSet images = null; - final Map anchors = new HashMap(); + final Map anchors = new HashMap(); int urls = 0; int urlStart = -1; int urlEnd = 0; @@ -98,7 +97,7 @@ public class swfParser extends AbstractParser implements Parser { urlnr = Integer.toString(++urls).toString(); final Properties p = new Properties(); p.put("name", urlnr); - anchors.put(new MultiProtocolURI(url), p); + anchors.put(new DigestURI(url), p); contents = contents.substring(0,urlStart)+contents.substring(urlEnd); } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 3e098d5c5..d2507cf15 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -33,7 +33,6 @@ import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -90,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(DigestURI.toDigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp); + subDocs = TextParser.parseSource(DigestURI.newURL(url, "#" + name), mime, null, tmp); if (subDocs == null) continue; for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 8304a2e81..8ffe75658 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -37,7 +37,6 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.Properties; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.document.AbstractParser; @@ -70,7 +69,7 @@ public class vcfParser extends AbstractParser implements Parser { final StringBuilder parsedTitle = new StringBuilder(); final StringBuilder parsedDataText = new StringBuilder(); final HashMap parsedData = new HashMap(); - final HashMap anchors = new HashMap(); + final HashMap anchors = new HashMap(); final LinkedList parsedNames = new LinkedList(); boolean useLastLine = false; @@ -177,7 +176,7 @@ public class vcfParser extends AbstractParser implements Parser { parsedData.clear(); } else if (key.toUpperCase().startsWith("URL")) { try { - final MultiProtocolURI newURL = new MultiProtocolURI(value); + final DigestURI newURL = new DigestURI(value); final Properties p = new Properties(); p.put("name", newURL.toString()); anchors.put(newURL, p); diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index a124bd946..dc1346140 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -32,7 +32,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -87,7 +86,7 @@ public class zipParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); - final DigestURI virtualURL = DigestURI.toDigestURI(MultiProtocolURI.newURL(url, "#" + name)); + final DigestURI virtualURL = DigestURI.newURL(url, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); docs = TextParser.parseSource(virtualURL, mime, null, tmp); if (docs == null) continue; diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index 190ef46d7..a6b9dba11 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -138,15 +138,12 @@ public class DigestURI extends MultiProtocolURI implements Serializable { * DigestURI from general URI * @param u */ + /* private DigestURI(final MultiProtocolURI u) { super(u); this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null; } - - - public static DigestURI toDigestURI(MultiProtocolURI u) { - return (u instanceof DigestURI) ? ((DigestURI) u) : new DigestURI(u); - } + */ /** * DigestURI from general URI, hash already calculated @@ -168,6 +165,23 @@ public class DigestURI extends MultiProtocolURI implements Serializable { this.hash = null; } + public static DigestURI newURL(final DigestURI baseURL, String relPath) throws MalformedURLException { + if (relPath.startsWith("//")) { + // patch for urls starting with "//" which can be found in the wild + relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath; + } + if ((baseURL == null) || + isHTTP(relPath) || + isHTTPS(relPath) || + isFTP(relPath) || + isFile(relPath) || + isSMB(relPath)/*|| + relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) { + return new DigestURI(relPath); + } + return new DigestURI(baseURL, relPath); + } + private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful @Override diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index e30bcaaa9..97633606c 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -169,7 +169,7 @@ public class Transmission { notFoundx.add(e.urlhash()); continue; } - if (!Transmission.this.segment.fulltext().exists(e.urlhash())) { + if (!Transmission.this.segment.fulltext().exists(ASCII.String(e.urlhash()))) { notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); } else { diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index 2d1427d00..5e0cbd555 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -46,7 +46,6 @@ import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.sorting.ClusteredScoreMap; @@ -83,9 +82,9 @@ public class WebStructureGraph { private static class LearnObject { private final DigestURI url; - private final Set globalRefURLs; + private final Set globalRefURLs; - private LearnObject(final DigestURI url, final Set globalRefURLs) { + private LearnObject(final DigestURI url, final Set globalRefURLs) { this.url = url; this.globalRefURLs = globalRefURLs; } @@ -160,11 +159,11 @@ public class WebStructureGraph { public void generateCitationReference(final DigestURI url, final Document document) { // generate citation reference - final Map hl = document.getHyperlinks(); - final Iterator it = hl.keySet().iterator(); - final HashSet globalRefURLs = new HashSet(); + final Map hl = document.getHyperlinks(); + final Iterator it = hl.keySet().iterator(); + final HashSet globalRefURLs = new HashSet(); final String refhost = url.getHost(); - MultiProtocolURI u; + DigestURI u; int maxref = 1000; while ( it.hasNext() && maxref-- > 0 ) { u = it.next(); @@ -191,7 +190,7 @@ public class WebStructureGraph { } public void generateCitationReference(final DigestURI from, final DigestURI to) { - final HashSet globalRefURLs = new HashSet(); + final HashSet globalRefURLs = new HashSet(); final String refhost = from.getHost(); if (refhost != null && to.getHost() != null && !to.getHost().equals(refhost)) globalRefURLs.add(to); final LearnObject lro = new LearnObject(from, globalRefURLs); @@ -586,12 +585,10 @@ public class WebStructureGraph { private void learnrefs(final LearnObject lro) { final Set refhosts = new HashSet(); - DigestURI du; String hosthash; - for ( final MultiProtocolURI u : lro.globalRefURLs ) { + for ( final DigestURI u : lro.globalRefURLs ) { if (Switchboard.getSwitchboard().shallTerminate()) break; - du = DigestURI.toDigestURI(u); - hosthash = ASCII.String(du.hash(), 6, 6); + hosthash = ASCII.String(u.hash(), 6, 6); if (!exists(hosthash)) { // this must be recorded as an host with no references synchronized ( this.structure_new ) { diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index ae5a3e302..742e8b243 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -245,10 +245,10 @@ public final class yacyRelease extends yacyVersion { } // analyze links in scraper resource, and find link to latest release in it - final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation + final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation final TreeSet mainReleases = new TreeSet(); final TreeSet devReleases = new TreeSet(); - for (final MultiProtocolURI url : anchors.keySet()) { + for (final DigestURI url : anchors.keySet()) { try { final yacyRelease release = new yacyRelease(url, location.getPublicKey()); //System.out.println("r " + release.toAnchor()); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 4f8102f23..a65c4dae4 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -39,7 +39,6 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; @@ -381,7 +380,7 @@ public final class LoaderDispatcher { * @return a map from URLs to the anchor texts of the urls * @throws IOException */ - public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException { + public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException { final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 575889858..8ed22305f 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1545,12 +1545,12 @@ public final class Switchboard extends serverSwitch { return false; } - public HarvestProcess urlExists(final byte[] hash) { + public HarvestProcess urlExists(final String hash) { // tests if hash occurrs in any database // if it exists, the name of the database is returned, // if it not exists, null is returned if (this.index.exists(hash)) return HarvestProcess.LOADED; - return this.crawlQueues.urlExists(hash); + return this.crawlQueues.urlExists(ASCII.getBytes(hash)); } public void urlRemove(final Segment segment, final byte[] hash) { @@ -2494,7 +2494,7 @@ public final class Switchboard extends serverSwitch { ) ) { // get the hyperlinks - final Map hl = Document.getHyperlinks(documents); + final Map hl = Document.getHyperlinks(documents); // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links if (response.profile().directDocByURL()) { @@ -2506,7 +2506,7 @@ public final class Switchboard extends serverSwitch { // insert those hyperlinks to the crawler MultiProtocolURI nextUrl; - for ( final Map.Entry nextEntry : hl.entrySet() ) { + for ( final Map.Entry nextEntry : hl.entrySet() ) { // check for interruption checkInterruption(); @@ -2654,7 +2654,7 @@ public final class Switchboard extends serverSwitch { // CREATE INDEX final String dc_title = document.dc_title(); - final DigestURI url = DigestURI.toDigestURI(document.dc_source()); + final DigestURI url = document.dc_source(); final DigestURI referrerURL = queueEntry.referrerURL(); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); @@ -2711,14 +2711,14 @@ public final class Switchboard extends serverSwitch { feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url(), ASCII.String(queueEntry.url().hash()))); // store rss feeds in document into rss table - for ( final Map.Entry rssEntry : document.getRSS().entrySet() ) { + for ( final Map.Entry rssEntry : document.getRSS().entrySet() ) { final Tables.Data rssRow = new Tables.Data(); rssRow.put("referrer", url.hash()); rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true))); rssRow.put("title", UTF8.getBytes(rssEntry.getValue())); rssRow.put("recording_date", new Date()); try { - this.tables.update("rss", DigestURI.toDigestURI(rssEntry.getKey()).hash(), rssRow); + this.tables.update("rss", rssEntry.getKey().hash(), rssRow); } catch ( final IOException e ) { Log.logException(e); } @@ -2760,7 +2760,7 @@ public final class Switchboard extends serverSwitch { public final void addAllToIndex( final DigestURI url, - final Map links, + final Map links, final SearchEvent searchEvent, final String heuristicName) { @@ -2775,10 +2775,10 @@ public final class Switchboard extends serverSwitch { } // check if some of the links match with the query - final Map matcher = searchEvent.query.separateMatches(links); + final Map matcher = searchEvent.query.separateMatches(links); // take the matcher and load them all - for ( final Map.Entry entry : matcher.entrySet() ) { + for ( final Map.Entry entry : matcher.entrySet() ) { try { addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); } catch ( final IOException e ) { @@ -2787,7 +2787,7 @@ public final class Switchboard extends serverSwitch { } // take then the no-matcher and load them also - for ( final Map.Entry entry : links.entrySet() ) { + for ( final Map.Entry entry : links.entrySet() ) { try { addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); } catch ( final IOException e ) { @@ -2926,10 +2926,10 @@ public final class Switchboard extends serverSwitch { public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, Parser.Failure { - if ( searchEvent != null ) { + if (searchEvent != null) { searchEvent.addHeuristic(url.hash(), heuristicName, true); } - if ( this.index.exists(url.hash()) ) { + if (this.index.exists(ASCII.String(url.hash()))) { return; // don't do double-work } final Request request = this.loader.request(url, true, true); @@ -3004,7 +3004,7 @@ public final class Switchboard extends serverSwitch { */ public void addToCrawler(final DigestURI url, final boolean asglobal) { - if ( this.index.exists(url.hash()) ) { + if (this.index.exists(ASCII.String(url.hash()))) { return; // don't do double-work } final Request request = this.loader.request(url, true, true); @@ -3204,7 +3204,7 @@ public final class Switchboard extends serverSwitch { return "no DHT distribution: not enabled (per setting)"; } final Segment indexSegment = this.index; - int size = indexSegment.fulltext().size(); + long size = indexSegment.fulltext().collectionSize(); if ( size < 10 ) { return "no DHT distribution: loadedURL.size() = " + size; } @@ -3348,12 +3348,12 @@ public final class Switchboard extends serverSwitch { return; } - final Map links; + final Map links; searchEvent.rankingProcess.oneFeederStarted(); try { links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay); if ( links != null ) { - final Iterator i = links.keySet().iterator(); + final Iterator i = links.keySet().iterator(); while ( i.hasNext() ) { if ( !i.next().getHost().endsWith(host) ) { i.remove(); @@ -3387,16 +3387,16 @@ public final class Switchboard extends serverSwitch { return; } - final Map links; + final Map links; DigestURI url; try { links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay); if (links != null) { if (links.size() < 1000) { // limit to 1000 to skip large index pages - final Iterator i = links.keySet().iterator(); + final Iterator i = links.keySet().iterator(); final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false); while (i.hasNext()) { - url = DigestURI.toDigestURI(i.next()); + url = i.next(); boolean islocal = url.getHost().contentEquals(startUrl.getHost()); // add all external links or links to different page to crawler if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) { @@ -3458,11 +3458,11 @@ public final class Switchboard extends serverSwitch { //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); if ( rss != null ) { - final Map links = new TreeMap(); - MultiProtocolURI uri; + final Map links = new TreeMap(); + DigestURI uri; for ( final RSSMessage message : rss.getFeed() ) { try { - uri = new MultiProtocolURI(message.getLink()); + uri = new DigestURI(message.getLink()); links.put(uri, message.getTitle()); } catch ( final MalformedURLException e ) { } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 65342c7f3..ebcd82526 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.PrintWriter; import java.net.MalformedURLException; import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -195,9 +196,12 @@ public final class Fulltext { this.solrInstances.disconnect1(); } - public EmbeddedSolrConnector getDefaultLocalSolrConnector() { - if (this.solrInstances.getSolr0() == null) return null; - return new EmbeddedSolrConnector(this.solrInstances.getSolr0()); + public EmbeddedSolrConnector getDefaultEmbeddedConnector() { + return this.solrInstances.getDefaultEmbeddedConnector(); + } + + public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + return this.solrInstances.getEmbeddedConnector(corename); } public RemoteSolrConnector getDefaultRemoteSolrConnector() { @@ -210,11 +214,11 @@ public final class Fulltext { } public SolrConnector getDefaultConnector() { - return this.solrInstances.getDefaultConnector(); + return this.solrInstances.getDefaultMirrorConnector(); } public SolrConnector getWebgraphConnector() { - return this.solrInstances.getConnector(WebgraphSchema.CORE_NAME); + return this.solrInstances.getMirrorConnector(WebgraphSchema.CORE_NAME); } public void clearCache() { @@ -232,7 +236,7 @@ public final class Fulltext { this.urlIndexFile.clear(); } this.statsDump = null; - this.solrInstances.getDefaultConnector().commit(true); + this.commit(true); } public void clearLocalSolr() throws IOException { @@ -240,6 +244,7 @@ public final class Fulltext { if (instance != null) { for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear(); } + this.commit(false); this.solrInstances.clearCache(); } @@ -255,11 +260,19 @@ public final class Fulltext { * get the size of the default index * @return */ - public int size() { - int size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size(); - size += this.solrInstances.getDefaultConnector().getSize(); + public long collectionSize() { + long size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size(); + size += this.getDefaultConnector().getSize(); return size; } + + /** + * get the size of the webgraph index + * @return + */ + public long webgraphSize() { + return this.getWebgraphConnector().getSize(); + } public void close() { this.statsDump = null; @@ -279,7 +292,7 @@ public final class Fulltext { if (urlHash == null) return null; Date x; try { - x = (Date) this.solrInstances.getDefaultConnector().getFieldById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName()); + x = (Date) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName()); } catch (IOException e) { return null; } @@ -290,7 +303,7 @@ public final class Fulltext { if (urlHash == null) return null; String x; try { - x = (String) this.solrInstances.getDefaultConnector().getFieldById(ASCII.String(urlHash), CollectionSchema.sku.getSolrFieldName()); + x = (String) this.getDefaultConnector().getFieldById(ASCII.String(urlHash), CollectionSchema.sku.getSolrFieldName()); } catch (IOException e) { return null; } @@ -317,7 +330,7 @@ public final class Fulltext { // get the metadata from Solr try { - SolrDocument doc = this.solrInstances.getDefaultConnector().getById(ASCII.String(urlHash)); + SolrDocument doc = this.getDefaultConnector().getById(ASCII.String(urlHash)); if (doc != null) { if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); return new URIMetadataNode(doc, wre, weight); @@ -346,17 +359,27 @@ public final class Fulltext { String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); byte[] idb = ASCII.getBytes(id); try { - if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - Date sdDate = (Date) this.solrInstances.getDefaultConnector().getFieldById(id, CollectionSchema.last_modified.getSolrFieldName()); - Date docDate = null; - if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) { + if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); + Date sdDate = (Date) this.getDefaultConnector().getFieldById(id, CollectionSchema.last_modified.getSolrFieldName()); + Date docDate = null; + if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) { if (this.collectionConfiguration.contains(CollectionSchema.ip_s)) { // ip_s needs a dns lookup which causes blockings during search here - this.solrInstances.getDefaultConnector().add(doc); + this.getDefaultConnector().add(doc); } else synchronized (this.solrInstances) { - this.solrInstances.getDefaultConnector().add(doc); + this.getDefaultConnector().add(doc); } - } + } + } catch (SolrException e) { + throw new IOException(e.getMessage(), e); + } + this.statsDump = null; + if (MemoryControl.shortStatus()) clearCache(); + } + + public void putEdges(final Collection edges) throws IOException { + try { + this.getWebgraphConnector().add(edges); } catch (SolrException e) { throw new IOException(e.getMessage(), e); } @@ -371,13 +394,13 @@ public final class Fulltext { String id = ASCII.String(idb); try { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - SolrDocument sd = this.solrInstances.getDefaultConnector().getById(id); + SolrDocument sd = this.getDefaultConnector().getById(id); if (sd == null || (new URIMetadataNode(sd)).isOlder(row)) { if (this.collectionConfiguration.contains(CollectionSchema.ip_s)) { // ip_s needs a dns lookup which causes blockings during search here - this.solrInstances.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); + this.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); } else synchronized (this.solrInstances) { - this.solrInstances.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); + this.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row)); } } } catch (SolrException e) { @@ -397,15 +420,22 @@ public final class Fulltext { public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) { // first collect all url hashes that belong to the domain assert hosthash.length() == 6; - final String q = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); + final String collection1Query = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); + final String webgraphQuery = WebgraphSchema.source_host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); Thread t = new Thread() { public void run() { // delete in solr synchronized (Fulltext.this.solrInstances) { - try { - Fulltext.this.solrInstances.getDefaultConnector().deleteByQuery(q); - } catch (IOException e) {} + try {Fulltext.this.getDefaultConnector().deleteByQuery(collection1Query);} catch (IOException e) {} + try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (IOException e) {} } // delete in old metadata structure @@ -443,21 +473,30 @@ public final class Fulltext { }; if (concurrent) t.start(); else { t.run(); - Fulltext.this.getDefaultConnector().commit(true); + Fulltext.this.commit(true); } } public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) { // first collect all url hashes that belong to the domain - final String q = CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); + final String collectionQuery = + CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); + final String webgraphQuery = + WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + hostname + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? + (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : + "" + ); Thread t = new Thread() { public void run() { // delete in solr synchronized (Fulltext.this.solrInstances) { - try { - Fulltext.this.getDefaultConnector().deleteByQuery(q); - } catch (IOException e) {} + try {Fulltext.this.getDefaultConnector().deleteByQuery(collectionQuery);} catch (IOException e) {} + try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (IOException e) {} } // finally remove the line with statistics if (Fulltext.this.statsDump != null) { @@ -475,7 +514,7 @@ public final class Fulltext { }; if (concurrent) t.start(); else { t.run(); - Fulltext.this.getDefaultConnector().commit(true); + Fulltext.this.commit(true); } } @@ -489,12 +528,12 @@ public final class Fulltext { DigestURI uri; try {uri = new DigestURI(basepath);} catch (MalformedURLException e) {return 0;} final String host = uri.getHost(); - final String q = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + + final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + ((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); final AtomicInteger count = new AtomicInteger(0); Thread t = new Thread(){ public void run() { - final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentQuery(q, 0, 1000000, 600000, -1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); + final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentQuery(collectionQuery, 0, 1000000, 600000, -1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); try { SolrDocument doc; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { @@ -504,7 +543,7 @@ public final class Fulltext { count.incrementAndGet(); } } - if (count.get() > 0) Fulltext.this.getDefaultConnector().commit(true); + if (count.get() > 0) Fulltext.this.commit(true); } catch (InterruptedException e) {} } }; @@ -525,8 +564,9 @@ public final class Fulltext { synchronized (Fulltext.this.solrInstances) { for (byte[] urlHash: deleteIDs) { Fulltext.this.getDefaultConnector().delete(ASCII.String(urlHash)); + Fulltext.this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":" + ASCII.String(urlHash)); } - Fulltext.this.getDefaultConnector().commit(true); + Fulltext.this.commit(true); } } catch (final Throwable e) { Log.logException(e); @@ -546,6 +586,7 @@ public final class Fulltext { try { synchronized (this.solrInstances) { this.getDefaultConnector().delete(ASCII.String(urlHash)); + this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":" + ASCII.String(urlHash)); } } catch (final Throwable e) { Log.logException(e); @@ -560,11 +601,11 @@ public final class Fulltext { return false; } - public boolean exists(final byte[] urlHash) { + public boolean exists(final String urlHash) { if (urlHash == null) return false; - if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true; + if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true; try { - if (this.getDefaultConnector().exists(CollectionSchema.id.getSolrFieldName(), ASCII.String(urlHash))) return true; + if (this.getDefaultConnector().exists(CollectionSchema.id.getSolrFieldName(), urlHash)) return true; } catch (final Throwable e) { Log.logException(e); } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 94da14db4..55d068a83 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -195,7 +195,7 @@ public class Segment { } public long URLCount() { - return this.fulltext.size(); + return this.fulltext.collectionSize(); } public long RWICount() { @@ -219,7 +219,7 @@ public class Segment { return count; } - public boolean exists(final byte[] urlhash) { + public boolean exists(final String urlhash) { return this.fulltext.exists(urlhash); } @@ -284,16 +284,16 @@ public class Segment { return this.segmentPath; } - private int addCitationIndex(final DigestURI url, final Date urlModified, final Map anchors) { + private int addCitationIndex(final DigestURI url, final Date urlModified, final Map anchors) { if (anchors == null) return 0; int refCount = 0; // iterate over all outgoing links, this will create a context for those links final byte[] urlhash = url.hash(); final long urldate = urlModified.getTime(); - for (Map.Entry anchorEntry: anchors.entrySet()) { - MultiProtocolURI anchor = anchorEntry.getKey(); - byte[] refhash = DigestURI.toDigestURI(anchor).hash(); + for (Map.Entry anchorEntry: anchors.entrySet()) { + DigestURI anchor = anchorEntry.getKey(); + byte[] refhash = anchor.hash(); //System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString()); if (this.urlCitationIndex != null) try { this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate)); @@ -377,7 +377,7 @@ public class Segment { // DO A SOFT/HARD COMMIT IF NEEDED if (MemoryControl.shortStatus()) { // do a 'hard' commit to flush index caches - this.fulltext.getDefaultConnector().commit(false); + this.fulltext.commit(false); } else { if ( (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.exact_signature_l) && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.exact_signature_unique_b)) || @@ -404,7 +404,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final SolrInputDocument solrInputDoc = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex); + final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); // FIND OUT IF THIS IS A DOUBLE DOCUMENT for (CollectionSchema[] checkfields: new CollectionSchema[][]{ @@ -414,11 +414,11 @@ public class Segment { CollectionSchema uniquefield = checkfields[1]; if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) { // lookup the document with the same signature - long signature = ((Long) solrInputDoc.getField(checkfield.getSolrFieldName()).getValue()).longValue(); + long signature = ((Long) vector.getField(checkfield.getSolrFieldName()).getValue()).longValue(); try { if (this.fulltext.getDefaultConnector().exists(checkfield.getSolrFieldName(), Long.toString(signature))) { // change unique attribut in content - solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + vector.setField(uniquefield.getSolrFieldName(), false); } } catch (IOException e) {} } @@ -434,14 +434,14 @@ public class Segment { // lookup in the index for the same title String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description(); if (checkstring.length() == 0) { - solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + vector.setField(uniquefield.getSolrFieldName(), false); continue uniquecheck; } checkstring = ClientUtils.escapeQueryChars("\"" + checkstring + "\""); try { if (this.fulltext.getDefaultConnector().exists(checkfield.getSolrFieldName(), checkstring)) { // switch unique attribute in new document - solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + vector.setField(uniquefield.getSolrFieldName(), false); // switch attribute also in all existing documents (which should be exactly only one!) SolrDocumentList docs = this.fulltext.getDefaultConnector().query(checkfield.getSolrFieldName() + ":" + checkstring + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000); for (SolrDocument doc: docs) { @@ -450,7 +450,7 @@ public class Segment { this.fulltext.getDefaultConnector().add(sid); } } else { - solrInputDoc.setField(uniquefield.getSolrFieldName(), true); + vector.setField(uniquefield.getSolrFieldName(), true); } } catch (IOException e) {} } @@ -459,7 +459,7 @@ public class Segment { // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.urlCitationIndex != null && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.references_i)) { int references = this.urlCitationIndex.count(url.hash()); - if (references > 0) solrInputDoc.setField(CollectionSchema.references_i.getSolrFieldName(), references); + if (references > 0) vector.setField(CollectionSchema.references_i.getSolrFieldName(), references); } // STORE TO SOLR @@ -467,7 +467,20 @@ public class Segment { tryloop: for (int i = 0; i < 20; i++) { try { error = null; - this.fulltext.putDocument(solrInputDoc); + this.fulltext.putDocument(vector); + break tryloop; + } catch ( final IOException e ) { + error = "failed to send " + urlNormalform + " to solr"; + Log.logWarning("SOLR", error + e.getMessage()); + if (i == 10) this.fulltext.commit(false); + try {Thread.sleep(1000);} catch (InterruptedException e1) {} + continue tryloop; + } + } + tryloop: for (int i = 0; i < 20; i++) { + try { + error = null; + this.fulltext.putEdges(vector.getWebgraphDocuments()); break tryloop; } catch ( final IOException e ) { error = "failed to send " + urlNormalform + " to solr"; @@ -567,7 +580,7 @@ public class Segment { } // finished - return solrInputDoc; + return vector; } public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 339edd344..f05cb0372 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -46,7 +46,6 @@ import org.apache.solr.client.solrj.SolrQuery.ORDER; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; @@ -531,11 +530,11 @@ public final class QueryParams { return this.queryGoal; } - public final Map separateMatches(final Map links) { - final Map matcher = new HashMap(); - final Iterator > i = links.entrySet().iterator(); - Map.Entry entry; - MultiProtocolURI url; + public final Map separateMatches(final Map links) { + final Map matcher = new HashMap(); + final Iterator > i = links.entrySet().iterator(); + Map.Entry entry; + DigestURI url; String anchorText; while (i.hasNext()) { entry = i.next(); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 3f01e1e59..37860b6b0 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -38,7 +38,6 @@ import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import net.yacy.cora.document.ASCII; @@ -158,7 +157,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrInputDocument metadata2solr(final URIMetadataRow md) { final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = DigestURI.toDigestURI(md.url()); + final DigestURI digestURI = md.url(); boolean allAttr = this.isEmpty(); if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, ""); @@ -283,13 +282,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } - public SolrInputDocument yacy2solr( + public static class SolrVector extends SolrInputDocument { + private static final long serialVersionUID = -210901881471714939L; + private List webgraphDocuments; + public SolrVector() { + super(); + this.webgraphDocuments = new ArrayList(); + } + public void addWebgraphDocument(SolrInputDocument webgraphDocument) { + this.webgraphDocuments.add(webgraphDocument); + } + public List getWebgraphDocuments() { + return this.webgraphDocuments; + } + } + + public SolrVector yacy2solr( final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language, - IndexCell citations) { + IndexCell citations, + WebgraphConfiguration webgraph) { // we use the SolrCell design as index schema - final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source()); + SolrVector doc = new SolrVector(); + final DigestURI digestURI = document.dc_source(); boolean allAttr = this.isEmpty(); Set processTypes = new LinkedHashSet(); @@ -299,24 +314,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri String docurl = digestURI.toNormalform(true); add(doc, CollectionSchema.sku, docurl); + int clickdepth = -1; if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) { if (digestURI.probablyRootURL()) { boolean lc = this.lazy; this.lazy = false; - add(doc, CollectionSchema.clickdepth_i, 0); + clickdepth = 0; this.lazy = lc; } else { // search the citations for references - int clickdepth = -1; try { clickdepth = getClickDepth(citations, digestURI); } catch (IOException e) { add(doc, CollectionSchema.clickdepth_i, -1); } - add(doc, CollectionSchema.clickdepth_i, clickdepth); if (clickdepth < 0 || clickdepth > 1) { processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut } } + add(doc, CollectionSchema.clickdepth_i, clickdepth); } if (allAttr || contains(CollectionSchema.ip_s)) { @@ -415,12 +430,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension()); // get list of all links; they will be shrinked by urls that appear in other fields of the solr schema - Set inboundLinks = document.inboundLinks(); - Set outboundLinks = document.outboundLinks(); + Set inboundLinks = document.inboundLinks(); + Set outboundLinks = document.outboundLinks(); int c = 0; final Object parser = document.getParserObject(); - Map images = new HashMap(); + Map images = new HashMap(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; images = html.getImages(); @@ -546,11 +561,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // style sheets if (allAttr || contains(CollectionSchema.css_tag_txt)) { - final Map csss = html.getCSS(); + final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; c = 0; - for (final Map.Entry entry: csss.entrySet()) { + for (final Map.Entry entry: csss.entrySet()) { final String cssurl = entry.getKey().toNormalform(false); inboundLinks.remove(cssurl); outboundLinks.remove(cssurl); @@ -567,10 +582,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // Scripts if (allAttr || contains(CollectionSchema.scripts_txt)) { - final Set scriptss = html.getScript(); + final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; - for (final MultiProtocolURI u: scriptss) { + for (final DigestURI u: scriptss) { inboundLinks.remove(u); outboundLinks.remove(u); scripts[c++] = u.toNormalform(false); @@ -581,10 +596,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // Frames if (allAttr || contains(CollectionSchema.frames_txt)) { - final Set framess = html.getFrames(); + final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; - for (final MultiProtocolURI u: framess) { + for (final DigestURI u: framess) { inboundLinks.remove(u); outboundLinks.remove(u); frames[c++] = u.toNormalform(false); @@ -595,10 +610,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // IFrames if (allAttr || contains(CollectionSchema.iframes_txt)) { - final Set iframess = html.getIFrames(); + final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; - for (final MultiProtocolURI u: iframess) { + for (final DigestURI u: iframess) { inboundLinks.remove(u); outboundLinks.remove(u); iframes[c++] = u.toNormalform(false); @@ -609,7 +624,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // canonical tag if (allAttr || contains(CollectionSchema.canonical_t)) { - final MultiProtocolURI canonical = html.getCanonical(); + final DigestURI canonical = html.getCanonical(); if (canonical != null) { inboundLinks.remove(canonical); outboundLinks.remove(canonical); @@ -665,104 +680,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); } - // list all links - final Map alllinks = document.getAnchors(); - c = 0; + // statistics about the links if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, inboundLinks.size()); if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); - final List inboundlinksTag = new ArrayList(inboundLinks.size()); - final List inboundlinksURLProtocol = new ArrayList(inboundLinks.size()); - final List inboundlinksURLStub = new ArrayList(inboundLinks.size()); - final List inboundlinksName = new ArrayList(inboundLinks.size()); - final List inboundlinksRel = new ArrayList(inboundLinks.size()); - final List inboundlinksText = new ArrayList(inboundLinks.size()); - final List inboundlinksTextChars = new ArrayList(inboundLinks.size()); - final List inboundlinksTextWords = new ArrayList(inboundLinks.size()); - final List inboundlinksAltTag = new ArrayList(inboundLinks.size()); - for (final MultiProtocolURI u: inboundLinks) { - final Properties p = alllinks.get(u); - if (p == null) continue; - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String urls = u.toNormalform(false); - final int pr = urls.indexOf("://",0); - inboundlinksURLProtocol.add(urls.substring(0, pr)); - inboundlinksURLStub.add(urls.substring(pr + 3)); - inboundlinksName.add(name.length() > 0 ? name : ""); - inboundlinksRel.add(rel.length() > 0 ? rel : ""); - inboundlinksText.add(text.length() > 0 ? text : ""); - inboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); - inboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); - inboundlinksTag.add( - " 0 ? " rel=\"" + rel + "\"" : "") + - (name.length() > 0 ? " name=\"" + name + "\"" : "") + - ">" + - ((text.length() > 0) ? text : "") + ""); - ImageEntry ientry = images.get(u); - inboundlinksAltTag.add(ientry == null ? "" : ientry.alt()); - c++; - } - if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, inboundlinksTag); - if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(inboundlinksURLProtocol)); - if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, inboundlinksURLStub); - if (allAttr || contains(CollectionSchema.inboundlinks_name_txt)) add(doc, CollectionSchema.inboundlinks_name_txt, inboundlinksName); - if (allAttr || contains(CollectionSchema.inboundlinks_rel_sxt)) add(doc, CollectionSchema.inboundlinks_rel_sxt, inboundlinksRel); - if (allAttr || contains(CollectionSchema.inboundlinks_relflags_val)) add(doc, CollectionSchema.inboundlinks_relflags_val, relEval(inboundlinksRel)); - if (allAttr || contains(CollectionSchema.inboundlinks_text_txt)) add(doc, CollectionSchema.inboundlinks_text_txt, inboundlinksText); - if (allAttr || contains(CollectionSchema.inboundlinks_text_chars_val)) add(doc, CollectionSchema.inboundlinks_text_chars_val, inboundlinksTextChars); - if (allAttr || contains(CollectionSchema.inboundlinks_text_words_val)) add(doc, CollectionSchema.inboundlinks_text_words_val, inboundlinksTextWords); - if (allAttr || contains(CollectionSchema.inboundlinks_alttag_txt)) add(doc, CollectionSchema.inboundlinks_alttag_txt, inboundlinksAltTag); - - c = 0; if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size()); if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); - final List outboundlinksTag = new ArrayList(outboundLinks.size()); - final List outboundlinksURLProtocol = new ArrayList(outboundLinks.size()); - final List outboundlinksURLStub = new ArrayList(outboundLinks.size()); - final List outboundlinksName = new ArrayList(outboundLinks.size()); - final List outboundlinksRel = new ArrayList(outboundLinks.size()); - final List outboundlinksTextChars = new ArrayList(outboundLinks.size()); - final List outboundlinksTextWords = new ArrayList(outboundLinks.size()); - final List outboundlinksText = new ArrayList(outboundLinks.size()); - final List outboundlinksAltTag = new ArrayList(outboundLinks.size()); - for (final MultiProtocolURI u: outboundLinks) { - final Properties p = alllinks.get(u); - if (p == null) continue; - final String name = p.getProperty("name", ""); // the name attribute - final String rel = p.getProperty("rel", ""); // the rel-attribute - final String text = p.getProperty("text", ""); // the text between the tag - final String urls = u.toNormalform(false); - final int pr = urls.indexOf("://",0); - outboundlinksURLProtocol.add(urls.substring(0, pr)); - outboundlinksURLStub.add(urls.substring(pr + 3)); - outboundlinksName.add(name.length() > 0 ? name : ""); - outboundlinksRel.add(rel.length() > 0 ? rel : ""); - outboundlinksText.add(text.length() > 0 ? text : ""); - outboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); - outboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); - outboundlinksTag.add( - " 0 ? " rel=\"" + rel + "\"" : "") + - (name.length() > 0 ? " name=\"" + name + "\"" : "") + - ">" + - ((text.length() > 0) ? text : "") + ""); - ImageEntry ientry = images.get(u); - inboundlinksAltTag.add(ientry == null ? "" : ientry.alt()); - c++; - } - if (allAttr || contains(CollectionSchema.outboundlinks_tag_txt)) add(doc, CollectionSchema.outboundlinks_tag_txt, outboundlinksTag); - if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(outboundlinksURLProtocol)); - if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, outboundlinksURLStub); - if (allAttr || contains(CollectionSchema.outboundlinks_name_txt)) add(doc, CollectionSchema.outboundlinks_name_txt, outboundlinksName); - if (allAttr || contains(CollectionSchema.outboundlinks_rel_sxt)) add(doc, CollectionSchema.outboundlinks_rel_sxt, outboundlinksRel); - if (allAttr || contains(CollectionSchema.outboundlinks_relflags_val)) add(doc, CollectionSchema.outboundlinks_relflags_val, relEval(outboundlinksRel)); - if (allAttr || contains(CollectionSchema.outboundlinks_text_txt)) add(doc, CollectionSchema.outboundlinks_text_txt, outboundlinksText); - if (allAttr || contains(CollectionSchema.outboundlinks_text_chars_val)) add(doc, CollectionSchema.outboundlinks_text_chars_val, outboundlinksTextChars); - if (allAttr || contains(CollectionSchema.outboundlinks_text_words_val)) add(doc, CollectionSchema.outboundlinks_text_words_val, outboundlinksTextWords); - if (allAttr || contains(CollectionSchema.outboundlinks_alttag_txt)) add(doc, CollectionSchema.outboundlinks_alttag_txt, outboundlinksAltTag); - + + // list all links + WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks); + doc.webgraphDocuments.addAll(subgraph.edges); + if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, subgraph.tags[0]); + if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); + if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]); + if (allAttr || contains(CollectionSchema.outboundlinks_tag_txt)) add(doc, CollectionSchema.outboundlinks_tag_txt, subgraph.tags[1]); + if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1])); + if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, subgraph.urlStubs[1]); + // charset if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset()); @@ -896,6 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param rel * @return binary encoded information about rel */ + /* private static List relEval(final List rel) { List il = new ArrayList(rel.size()); for (final String s: rel) { @@ -907,6 +841,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } return il; } + */ /** * register an entry as error document diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 784dedf84..53b635ad2 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -107,9 +107,13 @@ public enum CollectionSchema implements SchemaDeclaration { // bit 12: "unavailable_after" contained in http header properties robots_i(SolrType.num_integer, true, true, false, "content of tag and the \"X-Robots-Tag\" HTTP property"), metagenerator_t(SolrType.text_general, true, true, false, "content of tag"), - inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, "internal links, only the protocol"), inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"), + inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), + outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"), + outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), + outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), + /* inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"), inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"), inboundlinks_relflags_val(SolrType.num_integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"), @@ -117,9 +121,6 @@ public enum CollectionSchema implements SchemaDeclaration { inboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of characters"), inboundlinks_text_words_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of words"), inboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), - outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), - outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"), - outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"), outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"), outboundlinks_relflags_val(SolrType.num_integer, true, true, true, "external links, the rel property of the a-tag, coded binary"), @@ -127,6 +128,7 @@ public enum CollectionSchema implements SchemaDeclaration { outboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of characters"), outboundlinks_text_words_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of words"), outboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), + */ images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"), diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index cb78f65e5..573034785 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -27,10 +27,23 @@ package net.yacy.search.schema; import java.io.File; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Date; import java.util.Iterator; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import org.apache.solr.common.SolrInputDocument; + +import net.yacy.cora.document.ASCII; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.util.CommonPattern; +import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; public class WebgraphConfiguration extends SchemaConfiguration implements Serializable { @@ -74,7 +87,178 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } } } + + public static class Subgraph { + public final ArrayList[] tags, urlProtocols, urlStubs; + public final ArrayList edges; + @SuppressWarnings("unchecked") + public Subgraph(int inboundSize, int outboundSize) { + this.tags = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlProtocols = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlStubs = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.edges = new ArrayList(inboundSize + outboundSize); + } + } + + public Subgraph edges( + final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth, + final Map alllinks, + final Map images, + final Set inboundLinks, + final Set outboundLinks + ) { + boolean allAttr = this.isEmpty(); + Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); + addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, true, inboundLinks); + addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, false, outboundLinks); + return subgraph; + } + + private void addEdges( + final Subgraph subgraph, + final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth, + final boolean allAttr, final Map alllinks, final Map images, + final boolean inbound, final Set links) { + for (final DigestURI target_url: links) { + final Properties p = alllinks.get(target_url); + if (p == null) continue; + final String name = p.getProperty("name", ""); // the name attribute + final String text = p.getProperty("text", ""); // the text between the tag + final String rel = p.getProperty("rel", ""); // the rel-attribute + int ioidx = inbound ? 0 : 1; + + // index organization + StringBuilder idi = new StringBuilder(8); + idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase()); + while (idi.length() < 8) idi.insert(0, '0'); + String source_id = ASCII.String(source.hash()); + String target_id = ASCII.String(target_url.hash()); + StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi); + SolrInputDocument edge = new SolrInputDocument(); + add(edge, WebgraphSchema.id, id.toString()); + if (allAttr || contains(WebgraphSchema.load_date_dt)) { + Date loadDate = new Date(); + Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; + add(edge, WebgraphSchema.load_date_dt, loadDate); + } + if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified()); + add(edge, WebgraphSchema.collection_sxt, collections); + // add the source attributes + add(edge, WebgraphSchema.source_id_s, source_id); + final String source_url_string = source.toNormalform(false); + int pr_source = source_url_string.indexOf("://",0); + if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source)); + if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3)); + Map source_searchpart = source.getSearchpartMap(); + if (source_searchpart == null) { + if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0); + } else { + if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size()); + if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()])); + if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()])); + } + if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length()); + String source_host = null; + if ((source_host = source.getHost()) != null) { + String dnc = Domains.getDNC(source_host); + String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1); + int pp = subdomOrga.lastIndexOf('.'); + String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); + String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1); + if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host); + if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash()); + if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc); + if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga); + if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc); + if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom); + } + if (allAttr || contains(WebgraphSchema.source_file_ext_s)) add(edge, WebgraphSchema.source_file_ext_s, source.getFileExtension()); + if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath()); + if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) { + String[] paths = source.getPaths(); + add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); + add(edge, WebgraphSchema.source_path_folders_sxt, paths); + } + add(edge, WebgraphSchema.source_clickdepth_i, clickdepth); + + // add the source attributes about the target + if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); + if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); + if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); + if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : "")); + if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : ""); + if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length()); + if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); + String tag = " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""; + subgraph.tags[ioidx].add(tag); + if (allAttr || contains(WebgraphSchema.target_tag_s)) add(edge, WebgraphSchema.target_tag_s, tag); + ImageEntry ientry = images.get(target_url); + String alttext = ientry == null ? "" : ientry.alt(); + if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext); + if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); + if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0); + + // add the target attributes + add(edge, WebgraphSchema.target_id_s, target_id); + final String target_url_string = target_url.toNormalform(false); + int pr_target = target_url_string.indexOf("://",0); + subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); + if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); + subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); + if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); + Map target_searchpart = target_url.getSearchpartMap(); + if (target_searchpart == null) { + if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0); + } else { + if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size()); + if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()])); + if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()])); + } + if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length()); + String target_host = null; + if ((target_host = target_url.getHost()) != null) { + String dnc = Domains.getDNC(target_host); + String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1); + int pp = subdomOrga.lastIndexOf('.'); + String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp); + String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1); + if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host); + if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash()); + if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc); + if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga); + if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc); + if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom); + } + if (allAttr || contains(WebgraphSchema.target_file_ext_s)) add(edge, WebgraphSchema.target_file_ext_s, target_url.getFileExtension()); + if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath()); + if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) { + String[] paths = target_url.getPaths(); + add(edge, WebgraphSchema.target_path_folders_count_i, paths.length); + add(edge, WebgraphSchema.target_path_folders_sxt, paths); + } + add(edge, WebgraphSchema.target_clickdepth_i, clickdepth); + + // add the edge to the subgraph + subgraph.edges.add(edge); + } + } + + /** + * encode a string containing attributes from anchor rel properties binary: + * bit 0: "me" contained in rel + * bit 1: "nofollow" contained in rel + * @param rel + * @return binary encoded information about rel + */ + private static int relEval(final String rels) { + int i = 0; + final String s0 = rels.toLowerCase().trim(); + if ("me".equals(s0)) i += 1; + if ("nofollow".equals(s0)) i += 2; + return i; + } /** * save configuration to file and update enum SolrFields diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 74a21ddf9..4202f2e08 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -30,16 +30,19 @@ import org.apache.solr.common.SolrInputDocument; public enum WebgraphSchema implements SchemaDeclaration { + // index organisation id(SolrType.string, true, true, false, "primary key of document, a combination of (28 characters)"), + last_modified(SolrType.date, true, true, false, "last-modified from http header"), + load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"), collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"), + // source information source_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (source)"), - source_url_s(SolrType.string, true, true, false, "the url of the document (source)"), + source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"), + source_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (source)"), source_file_ext_s(SolrType.string, true, true, false, "the file name extension (source)"), - source_tag_s(SolrType.string, true, true, false, "normalized (absolute URLs), as - tag with anchor text and nofollow (source)"), source_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (source)"), - source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"), - source_path_s(SolrType.string, true, true, true, "path of the url (source)"), + source_path_s(SolrType.string, true, true, false, "path of the url (source)"), source_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (source)"), source_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (source)"), source_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (source)"), @@ -47,12 +50,14 @@ public enum WebgraphSchema implements SchemaDeclaration { source_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url (source)"), source_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"), - source_host_s(SolrType.string, true, true, false, "host of the url"), + source_host_s(SolrType.string, true, true, false, "host of the url (source)"), + source_host_id_s(SolrType.string, true, true, false, "id of the host (source)"), source_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)"), source_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"), source_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' (source)"), source_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (source)"), + // information in the source about the target target_linktext_t(SolrType.text_general, true, true, false, "the text content of the a-tag (in source, but pointing to a target)"), target_linktext_charcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"), target_linktext_wordcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"), @@ -63,14 +68,15 @@ public enum WebgraphSchema implements SchemaDeclaration { target_rel_s(SolrType.string, true, true, false, "the rel property of the a-tag (in source, but pointing to a target)"), target_relflags_i(SolrType.num_integer, true, true, false, "the rel property of the a-tag, coded binary (in source, but pointing to a target)"), + // target information target_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (target)"), - target_url_s(SolrType.string, true, true, false, "the url of the document (target)"), + target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"), + target_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (target)"), target_file_ext_s(SolrType.string, true, true, false, "the file name extension (target)"), target_tag_s(SolrType.string, true, true, false, "normalized (absolute URLs), as - tag with anchor text and nofollow (target)"), target_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (target)"), - target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"), - target_path_s(SolrType.string, true, true, true, "path of the url (target)"), - target_path_folders_count_i(SolrType.num_integer, true, true, true, "count of all path elements in the url (target)"), + target_path_s(SolrType.string, true, true, false, "path of the url (target)"), + target_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (target)"), target_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (target)"), target_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (target)"), target_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url (target)"), @@ -78,11 +84,14 @@ public enum WebgraphSchema implements SchemaDeclaration { target_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"), target_host_s(SolrType.string, true, true, false, "host of the url (target)"), + target_host_id_s(SolrType.string, true, true, false, "id of the host (target)"), target_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)"), target_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain (target)"), target_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' (target)"), - target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)"); - + target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)"), + + target_inbound_b(SolrType.bool, true, true, false, "flag shows if the target host is equal to the source host"); + public final static String CORE_NAME = "webgraph"; public final static String VOCABULARY_PREFIX = "vocabulary_"; diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index 740290846..351455a12 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -38,7 +38,6 @@ import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; @@ -164,20 +163,20 @@ public class MediaSnippet implements Comparable, Comparator computeMediaSnippets(final DigestURI source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) { if (document == null) return new ArrayList(); - Map media = null; + Map media = null; if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks(); else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks(); else if (mediatype == ContentDomain.APP) media = document.getApplinks(); if (media == null) return null; - final Iterator> i = media.entrySet().iterator(); - Map.Entry entry; + final Iterator> i = media.entrySet().iterator(); + Map.Entry entry; DigestURI url; String desc; final List result = new ArrayList(); while (i.hasNext()) { entry = i.next(); - url = DigestURI.toDigestURI(entry.getKey()); + url = entry.getKey(); desc = entry.getValue(); if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() + @@ -202,7 +201,7 @@ public class MediaSnippet implements Comparable, Comparator result = new ArrayList(); while (i.hasNext()) { ientry = i.next(); - url = DigestURI.toDigestURI(ientry.url()); + url = ientry.url(); final String u = url.toString(); if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue; diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java index c0cec40e4..17fdba890 100644 --- a/source/net/yacy/server/http/HTTPDProxyHandler.java +++ b/source/net/yacy/server/http/HTTPDProxyHandler.java @@ -309,7 +309,7 @@ public final class HTTPDProxyHandler { DigestURI url = null; try { - url = DigestURI.toDigestURI(HeaderFramework.getRequestURL(conProp)); + url = HeaderFramework.getRequestURL(conProp); if (log.isFine()) log.logFine(reqID +" GET "+ url); if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); @@ -392,7 +392,7 @@ public final class HTTPDProxyHandler { final Request request = new Request( null, url, - requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(), + requestHeader.referer() == null ? null : requestHeader.referer().hash(), "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), @@ -528,7 +528,7 @@ public final class HTTPDProxyHandler { final Request request = new Request( null, url, - requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(), + requestHeader.referer() == null ? null : requestHeader.referer().hash(), "", responseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(),
    DatabaseEntriesDatabaseEntries
    Pages (URLs)Documents
    solr search api
    #[urlpublictextSize]#
    RWIs (Words)Webgraph Edges
    solr search api
    #[webgraphSize]#
    RWIs
    (P2P Chunks)
    #[rwipublictextSize]#