added the generation of 50 (!!) new Solr fields in the core 'webgraph'.

The default schema uses only some of them, and the resulting search index
now has the following properties:
- the webgraph index will have about 40 times as many entries as the
default index
- the complete index size will increase, possibly to about double the
current amount
As testing showed, not much indexing performance is lost. The default
index will be smaller (fields were moved out of it); thus searching
can be faster.
The new index allows some old parts of YaCy to be removed, i.e. the
specialized webgraph data and the noload crawler. The new index makes it
possible to:
- search within the link texts of linked but not indexed documents (about
20 times the size of the document index!!)
- get a very detailed link graph (see the query sketch after this list)
- enhance ranking using a complete link graph
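
As an illustration (a minimal sketch, not part of this commit), the link
graph around one host can be pulled from the new core with a plain Solr
query. The field names source_host_id_s and target_inbound_b are taken
from the webgraph schema below; the host, port and the example host id
are assumptions.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLEncoder;

public class WebgraphQuerySketch {
    public static void main(String[] args) throws Exception {
        // all edges that leave the given source host for another host
        // (outbound edges carry target_inbound_b:false)
        String q = URLEncoder.encode(
            "source_host_id_s:\"" + args[0] + "\" AND target_inbound_b:false", "UTF-8");
        URL url = new URL("http://localhost:8090/solr/select?core=webgraph&q=" + q + "&start=0&rows=10");
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
        for (String line; (line = reader.readLine()) != null;) {
            System.out.println(line); // raw Solr response
        }
        reader.close();
    }
}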

To get full access to the new index, the Solr API now has two access
points: one with the attribute core=collection1 for the default search
index, and core=webgraph for the new webgraph search index. This is also
available for p2p operation, but client access is not yet implemented.
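
A minimal sketch of the two access points (the select URLs also appear in
the templates further down; host and port are assumptions):

import java.io.InputStream;
import java.net.URL;

public class SolrCoreEndpoints {
    public static void main(String[] args) throws Exception {
        // the same select servlet serves both indexes; the 'core' parameter picks one
        for (String core : new String[] {"collection1", "webgraph"}) {
            URL url = new URL("http://localhost:8090/solr/select?core=" + core + "&q=*:*&start=0&rows=3");
            try (InputStream in = url.openStream()) {
                System.out.println(core + " core is reachable at " + url);
            }
        }
    }
}
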
Michael Peter Christen 12 years ago
parent 89ede0fe84
commit 788288eb9e

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="source"/>
<classpathentry excluding="api/|env/|processing/domaingraph/applet/|yacy/|api/bookmarks/|api/ymarks/|api/bookmarks/posts/|api/bookmarks/tags/|api/bookmarks/xbel/|solr/|gsa/" kind="src" path="htroot"/>
<classpathentry excluding="api/|env/|processing/domaingraph/applet/|yacy/|api/bookmarks/|api/ymarks/|api/bookmarks/posts/|api/bookmarks/tags/|api/bookmarks/xbel/|solr/|gsa/|solr/collection1/" kind="src" path="htroot"/>
<classpathentry excluding="bookmarks/|ymarks/|bookmarks/posts/|bookmarks/tags/|bookmarks/xbel/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="htroot/yacy"/>
@ -10,7 +10,7 @@
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/solr"/>
<classpathentry excluding="collection1/" kind="src" path="htroot/solr"/>
<classpathentry kind="src" path="htroot/gsa"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>

@ -214,27 +214,6 @@ inboundlinks_protocol_sxt
## internal links, the url only without the protocol
inboundlinks_urlstub_txt
## internal links, the name property of the a-tag
#inboundlinks_name_txt
## internal links, the rel property of the a-tag
#inboundlinks_rel_sxt
## internal links, the rel property of the a-tag, coded binary
#inboundlinks_relflags_val
## internal links, the text content of the a-tag
#inboundlinks_text_txt
## internal links, the length of the a-tag as number of characters
#inboundlinks_text_chars_val
## internal links, the length of the a-tag as number of words
#inboundlinks_text_words_val
## if the link is an image link, this contains the alt tag if the image is also linked as img link
#inboundlinks_alttag_txt
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
#outboundlinks_tag_txt
@ -244,27 +223,6 @@ outboundlinks_protocol_sxt
## external links, the url only without the protocol
outboundlinks_urlstub_txt
## external links, the name property of the a-tag
#outboundlinks_name_txt
## external links, the rel property of the a-tag
#outboundlinks_rel_sxt
## external links, the rel property of the a-tag, coded binary
#outboundlinks_relflags_val
## external links, the text content of the a-tag
#outboundlinks_text_txt
## external links, the length of the a-tag as number of characters
#outboundlinks_text_chars_val
## external links, the length of the a-tag as number of words
#outboundlinks_text_words_val
## if the link is an image link, this contains the alt tag if the image is also linked as img link
#outboundlinks_alttag_txt
## all image tags, encoded as <img> tag inclusive alt- and title property
#images_tag_txt

@ -15,6 +15,12 @@
## primary key of document, a combination of <source-url-hash><target-url-hash><four-digit-hex-counter> (28 characters)
id
## last-modified from http header, date (mandatory field)
last_modified
## time when resource was loaded
load_date_dt
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
@ -26,21 +32,18 @@ collection_sxt
## primary key of document, the URL hash (source)
source_id_s
## the url of the document (source)
#source_url_s
## the protocol of the url (source)
#source_protocol_s
## the url without the protocol (source)
#source_urlstub_s
## the file name extension (source)
#source_file_ext_s
## normalized (absolute URLs), as <a> - tag with anchor text and nofollow (source)
#source_tag_s
## number of all characters in the url (source)
#source_chars_i
## the protocol of the url (source)
#source_protocol_s
## path of the url (source)
#source_path_s
@ -62,9 +65,12 @@ source_id_s
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
## host of the url
## host of the url (source)
#source_host_s
## id of the host (source)
source_host_id_s
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)
#source_host_dnc_s
@ -117,8 +123,11 @@ target_name_t
## primary key of document, the URL hash (target)
target_id_s
## the url of the document (target)
target_url_s
## the protocol of the url (target)
target_protocol_s
## the url without the protocol (target)
target_urlstub_s
## the file name extension (target)
target_file_ext_s
@ -129,9 +138,6 @@ target_file_ext_s
## number of all characters in the url (target)
#target_chars_i
## the protocol of the url (target)
target_protocol_s
## path of the url (target)
#target_path_s
@ -156,6 +162,9 @@ target_path_folders_sxt
## host of the url (target)
#target_host_s
## id of the host (target)
target_host_id_s
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)
#target_host_dnc_s
@ -168,5 +177,5 @@ target_path_folders_sxt
## the remaining part of the host without organizationdnc (target)
#target_host_subdomain_s
## flag shows if the target host is equal to the source host
target_inbound_b

@ -121,7 +121,8 @@ public class ConfigHeuristics_p {
}
try {
sb.index.fulltext().getDefaultConfiguration().commit();
} catch (IOException ex) {}
} catch (IOException e) {
}
}
}

@ -143,13 +143,12 @@ public class CrawlStartScanner_p
if ( post.containsKey("crawl") ) {
// make a pk/url mapping
final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
final Map<byte[], DigestURI> pkmap =
new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
final Map<byte[], DigestURI> pkmap = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
while (se.hasNext()) {
final Scanner.Service u = se.next().getKey();
DigestURI uu;
try {
uu = DigestURI.toDigestURI(u.url());
uu = u.url();
pkmap.put(uu.hash(), uu);
} catch ( final MalformedURLException e ) {
Log.logException(e);
@ -197,15 +196,14 @@ public class CrawlStartScanner_p
String urlString;
DigestURI u;
try {
final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se =
Scanner.scancacheEntries();
final Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<Scanner.Service, Scanner.Access> host;
while ( se.hasNext() ) {
host = se.next();
try {
u = DigestURI.toDigestURI(host.getKey().url());
u = host.getKey().url();
urlString = u.toNormalform(true);
if ( host.getValue() == Access.granted
if (host.getValue() == Access.granted
&& Scanner.inIndex(apiCommentCache, urlString) == null ) {
String path =
"/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99";

@ -21,7 +21,7 @@
#%env/templates/submenuCrawlMonitor.template%#
<h2>Crawler</h2>
<noscript><p>(Please enable JavaScript to automatically update this page!)</p></noscript>
<fieldset style="width:270px;height:140px;float:left;">
<fieldset style="width:260px;height:140px;float:left;">
<legend>Queues</legend>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
@ -74,20 +74,24 @@
</tbody>
</table>
</fieldset>
<fieldset style="width:140px;height:140px;float:left;">
<fieldset style="width:180px;height:140px;float:left;">
<legend>Index Size</legend>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
<th>Database</th>
<th>Entries</th>
<th width="100">Database</th>
<th width="80">Entries</th>
</tr>
<tr class="TableCellLight">
<td align="left">Pages (URLs)</td>
<td align="left">Documents<br/><a href="/solr/select?core=collection1&q=*:*&start=0&rows=3">solr search api</a></td>
<td align="right"><span id="urldbsize">#[urlpublictextSize]#</span></td>
</tr>
<tr class="TableCellLight">
<td align="left">RWIs (Words)</td>
<td align="left">Webgraph Edges<br/><a href="/solr/select?core=webgraph&q=*:*&start=0&rows=3">solr search api</a></td>
<td align="right"><span id="webgraphsize">#[webgraphSize]#</span></td>
</tr>
<tr class="TableCellLight">
<td align="left">RWIs<br/>(P2P Chunks)</td>
<td align="right"><span id="rwidbsize">#[rwipublictextSize]#</span></td>
</tr>
</tbody>

@ -36,7 +36,6 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.SpaceExceededException;
@ -73,6 +72,7 @@ public class Crawler_p {
final serverObjects prop = new serverObjects();
prop.put("rejected", 0);
prop.put("urlpublictextSize", 0);
prop.put("webgraphSize", 0);
prop.put("rwipublictextSize", 0);
prop.put("list", "0");
prop.put("loaderSize", 0);
@ -277,8 +277,8 @@ public class Crawler_p {
try {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
// get links and generate filter
for (MultiProtocolURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(DigestURI.toDigestURI(u));
for (DigestURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(u);
}
} catch (IOException e) {
Log.logException(e);
@ -475,7 +475,7 @@ public class Crawler_p {
writer.close();
// get links and generate filter
final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
final Map<DigestURI, Properties> hyperlinks = scraper.getAnchors();
if (newcrawlingdepth > 0) {
if (fullDomain) {
newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks.keySet());

@ -79,7 +79,7 @@ public class HostBrowser {
// set default values
prop.put("path", "");
prop.put("result", "");
prop.putNum("ucount", fulltext.size());
prop.putNum("ucount", fulltext.collectionSize());
prop.put("hosts", 0);
prop.put("files", 0);
prop.put("admin", 0);
@ -117,7 +117,7 @@ public class HostBrowser {
String load = post.get("load", "");
boolean wait = false;
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(pathURI.hash())) {
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) {
// in case the url does not exist and loading is wanted, turn this request into a loading request
load = path;
wait = true;
@ -136,7 +136,7 @@ public class HostBrowser {
));
prop.put("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) for (int i = 0; i < 30; i++) {
if (sb.index.exists(url.hash())) break;
if (sb.index.exists(ASCII.String(url.hash()))) break;
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
} catch (MalformedURLException e) {
@ -480,7 +480,7 @@ public class HostBrowser {
}
// insert constants
prop.putNum("ucount", fulltext.size());
prop.putNum("ucount", fulltext.collectionSize());
// return rewrite properties
return prop;
}

@ -280,7 +280,7 @@ public class IndexControlRWIs_p {
Reference iEntry;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
if (!segment.fulltext().exists(iEntry.urlhash())) {
if (!segment.fulltext().exists(ASCII.String(iEntry.urlhash()))) {
try {
unknownURLEntries.put(iEntry.urlhash());
} catch (final SpaceExceededException e) {

@ -66,7 +66,7 @@ public class IndexControlURLs_p {
prop.put("urlstring", "");
prop.put("urlhash", "");
prop.put("result", "");
prop.putNum("ucount", segment.fulltext().size());
prop.putNum("ucount", segment.fulltext().collectionSize());
prop.put("otherHosts", "");
prop.put("genUrlProfile", 0);
prop.put("statistics", 1);
@ -312,7 +312,7 @@ public class IndexControlURLs_p {
}
// insert constants
prop.putNum("ucount", segment.fulltext().size());
prop.putNum("ucount", segment.fulltext().collectionSize());
// return rewrite properties
return prop;
}

@ -16,12 +16,13 @@
<p>If you use a custom Solr schema you may enter a different field name in the column 'Custom Solr Field Name' of the YaCy default attribute name</p>
<form action="IndexSchema_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
Select a Core:
Select a core:
<select id="core" name="core" onchange='submit()'>
#{cores}#
<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
#{/cores}#
</select>
&nbsp;&nbsp;&nbsp;... the core can be searched at <a href="/solr/select?core=#[core]#&q=*:*&start=0&rows=3">/solr/select?core=#[core]#&q=*:*&start=0&rows=3</a>
</form>
<form action="IndexSchema_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">

@ -51,7 +51,7 @@ public class IndexShare_p {
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", indexSegment.RWICount());
prop.putNum("ucount", indexSegment.fulltext().size());
prop.putNum("ucount", indexSegment.fulltext().collectionSize());
return prop; // be safe
}
@ -64,7 +64,7 @@ public class IndexShare_p {
// insert constants
prop.putNum("wcount", indexSegment.RWICount());
prop.putNum("ucount", indexSegment.fulltext().size());
prop.putNum("ucount", indexSegment.fulltext().collectionSize());
// return rewrite properties
return prop;

@ -28,6 +28,7 @@ import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
@ -272,7 +273,7 @@ public class Load_RSS_p {
final RSSMessage message = feed.getMessage(entry.getValue().substring(5));
final DigestURI messageurl = new DigestURI(message.getLink());
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(messageurl.hash()) != null) continue loop;
if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop;
sb.addToIndex(messageurl, null, null);
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
} catch (final IOException e) {
@ -317,7 +318,7 @@ public class Load_RSS_p {
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();
prop.put("showitems_item_" + i + "_state", sb.urlExists(messageurl.hash()) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state", sb.urlExists(ASCII.String(messageurl.hash())) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);

@ -63,7 +63,7 @@ public class ServerScannerList {
while (se.hasNext()) {
host = se.next();
try {
u = DigestURI.toDigestURI(host.getKey().url());
u = host.getKey().url();
urlString = u.toNormalform(true);
prop.put("servertable_list_" + i + "_edit", edit ? 1 : 0);
prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash()));

@ -308,7 +308,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors());
dark = (i % 2 == 0);
final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
final Map<DigestURI, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry;
while (tsi.hasNext()) {
@ -432,12 +432,12 @@ public class ViewFile {
final serverObjects prop,
final String[] wordArray,
int c,
final Map<MultiProtocolURI, String> media,
final Map<DigestURI, String> media,
final String type,
boolean dark,
final Map<MultiProtocolURI, Properties> alllinks) {
final Map<DigestURI, Properties> alllinks) {
int i = 0;
for (final Map.Entry<MultiProtocolURI, String> entry : media.entrySet()) {
for (final Map.Entry<DigestURI, String> entry : media.entrySet()) {
final Properties p = alllinks.get(entry.getKey());
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute

@ -127,11 +127,11 @@ public class getpageinfo {
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set<MultiProtocolURI> uris = scraper.getAnchors().keySet();
final Set<DigestURI> uris = scraper.getAnchors().keySet();
final StringBuilder links = new StringBuilder(uris.size() * 80);
final StringBuilder filter = new StringBuilder(uris.size() * 40);
count = 0;
for (final MultiProtocolURI uri: uris) {
for (final DigestURI uri: uris) {
if (uri == null) continue;
links.append(';').append(uri.toNormalform(true));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");

@ -127,11 +127,11 @@ public class getpageinfo_p {
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set<MultiProtocolURI> uris = scraper.getAnchors().keySet();
final Set<DigestURI> uris = scraper.getAnchors().keySet();
final StringBuilder links = new StringBuilder(uris.size() * 80);
final StringBuilder filter = new StringBuilder(uris.size() * 40);
count = 0;
for (final MultiProtocolURI uri: uris) {
for (final DigestURI uri: uris) {
if (uri == null) continue;
links.append(';').append(uri.toNormalform(true));
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");

@ -76,7 +76,8 @@ public class status_p {
prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER));
// index size
prop.putNum("urlpublictextSize", segment.fulltext().size());
prop.putNum("urlpublictextSize", segment.fulltext().collectionSize());
prop.putNum("webgraphSize", segment.fulltext().webgraphSize());
prop.putNum("rwipublictextSize", segment.RWICount());
// loader queue

@ -21,6 +21,7 @@
<dbsize>
<urlpublictext>#[urlpublictextSize]#</urlpublictext>
<webgraph>#[webgraphSize]#</webgraph>
<rwipublictext>#[rwipublictextSize]#</rwipublictext>
</dbsize>

@ -30,7 +30,6 @@ import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
@ -111,9 +110,9 @@ public class webstructure {
prop.put("references_documents_0_urle", url == null ? 0 : 1);
if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true));
int d = 0;
Iterator<MultiProtocolURI> i = scraper.inboundLinks().iterator();
Iterator<DigestURI> i = scraper.inboundLinks().iterator();
while (i.hasNext()) {
DigestURI refurl = DigestURI.toDigestURI(i.next());
DigestURI refurl = i.next();
byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
@ -122,7 +121,7 @@ public class webstructure {
}
i = scraper.outboundLinks().iterator();
while (i.hasNext()) {
DigestURI refurl = DigestURI.toDigestURI(i.next());
DigestURI refurl = i.next();
byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);

@ -11,8 +11,9 @@
<li><a href="/yacyinteractive.html" class="MenuItemLink">File Search</a></li>
<li><a href="/HostBrowser.html?hosts=" class="MenuItemLink">Host Browser</a></li>
<!--<li><a href="/yacysearch_location.html" class="MenuItemLink">Location Search</a></li>-->
<li><a href="/solr/select?q=*:*&start=0&rows=3" class="MenuItemLink">Embedded Solr API</a></li>
<li><a href="/gsa/search?q=www&size=3" class="MenuItemLink">Embedded GSA API</a></li>
<li><a href="/solr/select?q=*:*&start=0&rows=3&core=collection1" class="MenuItemLink">Solr Default Core</a></li>
<li><a href="/solr/select?q=*:*&start=0&rows=3&core=webgraph" class="MenuItemLink">Solr Webgraph Core</a></li>
<li><a href="/gsa/search?q=www&size=3" class="MenuItemLink">Google Search API</a></li>
<!--<li><a href="/yacy/ui/" accesskey="s" class="MenuItemLink">Rich Client Search</a></li>-->
<li><a href="/compare_yacy.html?display=1" class="MenuItemLink">Compare Search</a></li>
<li><a href="/ViewFile.html" class="MenuItemLink">URL Viewer</a></li>

@ -169,7 +169,7 @@ public class searchresult {
}
// get the embedded connector
EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector();
EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultEmbeddedConnector();
if (connector == null) return null;
// do the solr request

@ -90,7 +90,9 @@ function handleStatus(){
dbsize=getFirstChild(statusTag, "dbsize");
urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext"));
rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext"));
webgraphSize=getValue(getFirstChild(dbsize, "webgraph"));
document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize;
document.getElementById("webgraphsize").firstChild.nodeValue=webgraphSize;
document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize;
loaderqueue=getFirstChild(statusTag, "loaderqueue");

@ -42,6 +42,8 @@ import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryModifier;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -181,7 +183,8 @@ public class select {
}
// get the embedded connector
EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector();
boolean defaultConnector = post == null || post.get("core", CollectionSchema.CORE_NAME).equals(CollectionSchema.CORE_NAME);
EmbeddedSolrConnector connector = defaultConnector ? sb.index.fulltext().getDefaultEmbeddedConnector() : sb.index.fulltext().getEmbeddedConnector(WebgraphSchema.CORE_NAME);
if (connector == null) return null;
// do the solr request, generate facets if we use a special YaCy format

@ -109,7 +109,7 @@ public final class query {
if (obj.equals("lurlcount")) {
// return the number of all available l-url's
prop.put("response", sb.index.fulltext().size());
prop.put("response", sb.index.fulltext().collectionSize());
return prop;
}

@ -201,7 +201,7 @@ public final class transferRWI {
// check if we need to ask for the corresponding URL
if (!knownURL.has(urlHash) && !unknownURL.has(urlHash)) try {
if (sb.index.fulltext().exists(urlHash)) {
if (sb.index.fulltext().exists(ASCII.String(urlHash))) {
knownURL.put(urlHash);
} else {
unknownURL.put(urlHash);

@ -139,7 +139,7 @@ public final class transferURL {
}
// doublecheck
if (sb.index.exists(lEntry.hash())) {
if (sb.index.exists(ASCII.String(lEntry.hash()))) {
if (Network.log.isFine()) Network.log.logFine("transferURL: double URL '" + lEntry.url() + "' from peer " + otherPeerName);
lEntry = null;
doublecheck++;

@ -175,7 +175,7 @@ public class OpenSearchConnector {
if (sb == null) {
return false;
}
final EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultLocalSolrConnector();
final EmbeddedSolrConnector connector = sb.index.fulltext().getDefaultEmbeddedConnector();
// check if needed Solr fields are available (selected)
if (connector == null) {
Log.logSevere("OpenSearchConnector.Discover", "Error on connecting to embedded Solr index");

@ -21,6 +21,7 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@ -44,6 +45,7 @@ import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.ModifiableSolrParams;
@ -285,4 +287,11 @@ public abstract class AbstractSolrConnector implements SolrConnector {
throw new IOException(e.getMessage(), e);
}
}
@Override
public void add(final Collection<SolrInputDocument> solrdocs) throws IOException, SolrException {
for (SolrInputDocument solrdoc: solrdocs) {
add(solrdoc);
}
}
}

@ -58,6 +58,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
private final SearchHandler requestHandler;
private final EmbeddedInstance instance;
private final String coreName;
private SolrCore core;
public EmbeddedSolrConnector(EmbeddedInstance instance) {
@ -68,6 +69,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
this.requestHandler.init(new NamedList<Object>());
this.requestHandler.inform(this.core);
super.init(this.instance.getDefaultServer());
this.coreName = ((EmbeddedSolrServer) this.server).getCoreContainer().getDefaultCoreName();
}
public EmbeddedSolrConnector(EmbeddedInstance instance, String coreName) {
@ -78,6 +80,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
this.requestHandler.init(new NamedList<Object>());
this.requestHandler.inform(this.core);
super.init(this.instance.getServer(coreName));
this.coreName = coreName;
}
public SolrInstance getInstance() {
@ -104,9 +107,8 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
Thread.currentThread().setName("solr query: size");
EmbeddedSolrServer ess = (EmbeddedSolrServer) this.server;
CoreContainer coreContainer = ess.getCoreContainer();
String coreName = coreContainer.getDefaultCoreName();
SolrCore core = coreContainer.getCore(coreName);
if (core == null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No such core: " + coreName);
SolrCore core = coreContainer.getCore(this.coreName);
if (core == null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No such core: " + this.coreName);
try {
SolrParams params = AbstractSolrConnector.catchSuccessQuery;

@ -21,6 +21,7 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
@ -74,8 +75,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
/**
* delete entries from solr according the given solr query string
* @param id the url hash of the entry
* @return the number of deletions
* @param querystring
* @throws IOException
*/
public void deleteByQuery(final String querystring) throws IOException;
@ -96,6 +96,15 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @throws SolrException
*/
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException;
/**
* add a collection of solr input documents
* @param solrdocs
* @throws IOException
* @throws SolrException
*/
public void add(final Collection<SolrInputDocument> solrdoc) throws IOException, SolrException;
/**
* get a field value from solr by given key for the id-field and a field name
* @param key

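A usage sketch for the new collection-based add (the id value is
hypothetical; the connector would come from the InstanceMirror set up
further down):

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.federate.solr.connector.SolrConnector;

public class BatchAddSketch {
    public static void addBatch(SolrConnector connector) throws IOException {
        Collection<SolrInputDocument> batch = new ArrayList<SolrInputDocument>();
        SolrInputDocument doc = new SolrInputDocument();
        doc.setField("id", "0123456789ab"); // hypothetical document id (url hash)
        batch.add(doc);
        connector.add(batch); // one call for the whole batch instead of one add() per document
    }
}
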
@ -22,6 +22,8 @@ package net.yacy.cora.federate.solr.connector;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.yacy.kelondro.logging.Log;
@ -197,4 +199,32 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
}
}
@Override
public void add(final Collection<SolrInputDocument> solrdocs) throws IOException, SolrException {
if (this.server == null) return;
try {
for (SolrInputDocument solrdoc : solrdocs) {
if (solrdoc.containsKey("_version_")) solrdoc.setField("_version_",0L); // prevent Solr "version conflict"
}
synchronized (this.server) {
this.server.add(solrdocs, -1);
}
} catch (Throwable e) {
// catches "version conflict for": try this again and delete the document in advance
List<String> ids = new ArrayList<String>();
for (SolrInputDocument solrdoc : solrdocs) ids.add((String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
try {
this.server.deleteById(ids);
} catch (SolrServerException e1) {}
try {
synchronized (this.server) {
this.server.add(solrdocs, -1);
}
} catch (Throwable ee) {
log.warn(e.getMessage() + " IDs=" + ids.toString());
throw new IOException(ee);
}
}
}
}

@ -21,8 +21,8 @@
package net.yacy.cora.federate.solr.instance;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.federate.solr.connector.CachedSolrConnector;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
@ -36,12 +36,16 @@ public class InstanceMirror {
private ShardInstance solr1;
private CachedSolrConnector defaultConnector;
private Map<String, CachedSolrConnector> connectorCache;
private EmbeddedSolrConnector defaultEmbeddedConnector;
private Map<String, EmbeddedSolrConnector> embeddedCache;
public InstanceMirror() {
this.solr0 = null;
this.solr1 = null;
this.defaultConnector = null;
this.connectorCache = new HashMap<String, CachedSolrConnector>();
this.connectorCache = new ConcurrentHashMap<String, CachedSolrConnector>();
this.defaultEmbeddedConnector = null;
this.embeddedCache = new ConcurrentHashMap<String, EmbeddedSolrConnector>();
}
public boolean isConnected0() {
@ -50,8 +54,10 @@ public class InstanceMirror {
public void connect0(EmbeddedInstance c) {
for (SolrConnector connector: connectorCache.values()) connector.close();
this.connectorCache.clear();
this.defaultConnector = null;
this.connectorCache.clear();
this.defaultEmbeddedConnector = null;
this.embeddedCache.clear();
this.solr0 = c;
}
@ -62,8 +68,10 @@ public class InstanceMirror {
public void disconnect0() {
if (this.solr0 == null) return;
for (SolrConnector connector: connectorCache.values()) connector.close();
this.connectorCache.clear();
this.defaultConnector = null;
this.connectorCache.clear();
this.defaultEmbeddedConnector = null;
this.embeddedCache.clear();
this.solr0.close();
this.solr0 = null;
}
@ -74,8 +82,10 @@ public class InstanceMirror {
public void connect1(ShardInstance c) {
for (SolrConnector connector: connectorCache.values()) connector.close();
this.connectorCache.clear();
this.defaultConnector = null;
this.connectorCache.clear();
this.defaultEmbeddedConnector = null;
this.embeddedCache.clear();
this.solr1 = c;
}
@ -86,8 +96,10 @@ public class InstanceMirror {
public void disconnect1() {
if (this.solr1 == null) return;
for (SolrConnector connector: connectorCache.values()) connector.close();
this.connectorCache.clear();
this.defaultConnector = null;
this.connectorCache.clear();
this.defaultEmbeddedConnector = null;
this.embeddedCache.clear();
this.solr1.close();
this.solr1 = null;
}
@ -108,8 +120,23 @@ public class InstanceMirror {
if (this.solr1 != null) return this.solr1.getCoreNames();
return null;
}
public EmbeddedSolrConnector getDefaultEmbeddedConnector() {
if (this.defaultEmbeddedConnector != null) return this.defaultEmbeddedConnector;
this.defaultEmbeddedConnector = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0);
this.embeddedCache.put(this.getDefaultCoreName(), this.defaultEmbeddedConnector);
return this.defaultEmbeddedConnector;
}
public EmbeddedSolrConnector getEmbeddedConnector(String corename) {
EmbeddedSolrConnector ec = this.embeddedCache.get(corename);
if (ec != null) return ec;
ec = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0, corename);
this.embeddedCache.put(corename, ec);
return ec;
}
public SolrConnector getDefaultConnector() {
public SolrConnector getDefaultMirrorConnector() {
if (this.defaultConnector != null) return this.defaultConnector;
String defaultCoreName = this.getDefaultCoreName();
if (defaultCoreName == null) return null;
@ -120,7 +147,7 @@ public class InstanceMirror {
return this.defaultConnector;
}
public SolrConnector getConnector(String corename) {
public SolrConnector getMirrorConnector(String corename) {
CachedSolrConnector msc = this.connectorCache.get(corename);
if (msc != null) return msc;
EmbeddedSolrConnector esc = this.solr0 == null ? null : new EmbeddedSolrConnector(this.solr0, corename);

@ -40,10 +40,10 @@ import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.NumberTools;
import net.yacy.kelondro.data.meta.DigestURI;
/**
@ -560,7 +560,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
theHeader.append("\r\n");
}
public static MultiProtocolURI getRequestURL(final HashMap<String, Object> conProp) throws MalformedURLException {
public static DigestURI getRequestURL(final HashMap<String, Object> conProp) throws MalformedURLException {
String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST);
final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/'
final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given
@ -574,7 +574,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
host = host.substring(0, pos);
}
final MultiProtocolURI url = new MultiProtocolURI("http", host, port, (args == null) ? path : path + "?" + args);
final DigestURI url = new DigestURI("http", host, port, (args == null) ? path : path + "?" + args);
return url;
}

@ -25,6 +25,7 @@ import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.data.meta.DigestURI;
public class RequestHeader extends HeaderFramework {
@ -70,11 +71,11 @@ public class RequestHeader extends HeaderFramework {
super(reverseMappingCache, othermap);
}
public MultiProtocolURI referer() {
public DigestURI referer() {
final String referer = get(REFERER, null);
if (referer == null) return null;
try {
return new MultiProtocolURI(referer);
return new DigestURI(referer);
} catch (final MalformedURLException e) {
return null;
}

@ -44,6 +44,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
/**
* a protocol scanner
@ -98,8 +99,8 @@ public class Scanner extends Thread {
//this.hostname = Domains.getHostName(this.inetAddress);
return this.hostname;
}
public MultiProtocolURI url() throws MalformedURLException {
return new MultiProtocolURI(this.protocol.name() + "://" + getHostName() + "/");
public DigestURI url() throws MalformedURLException {
return new DigestURI(this.protocol.name() + "://" + getHostName() + "/");
}
@Override
public String toString() {

@ -191,7 +191,7 @@ public final class CrawlStacker {
}
}
}
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks) {
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<DigestURI, Properties> hyperlinks) {
new Thread() {
@Override
public void run() {
@ -201,12 +201,12 @@ public final class CrawlStacker {
}.start();
}
private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
for (final Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map<DigestURI, Properties> hyperlinks, final boolean replace) {
for (final Map.Entry<DigestURI, Properties> e: hyperlinks.entrySet()) {
if (e.getKey() == null) continue;
// delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
final DigestURI url = DigestURI.toDigestURI(e.getKey());
final DigestURI url = e.getKey();
final byte[] urlhash = url.hash();
if (replace) {
this.indexSegment.fulltext().remove(urlhash);

@ -61,7 +61,7 @@ public class ResultImages {
if (MemoryControl.shortStatus()) clearQueues();
limitQueues(1000);
final Map<MultiProtocolURI, ImageEntry> images = document.getImages();
final Map<DigestURI, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
if (image == null || image.url() == null) continue;

@ -28,7 +28,6 @@ import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@ -152,7 +151,7 @@ public final class HTTPLoader {
}
// normalize URL
final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
@ -172,7 +171,7 @@ public final class HTTPLoader {
}
// check if the url was already indexed
final HarvestProcess dbname = this.sb.urlExists(redirectionUrl.hash());
final HarvestProcess dbname = this.sb.urlExists(ASCII.String(redirectionUrl.hash()));
if (dbname != null) { // customer request
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname.toString());
@ -293,7 +292,7 @@ public final class HTTPLoader {
}
// normalizing URL
final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString);
// if we are already doing a shutdown we don't need to retry crawling

@ -28,6 +28,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
@ -90,7 +91,7 @@ public class RSSLoader extends Thread {
try {
final DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(messageurl.hash()) != null) continue loop;
if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
loadCount++;

@ -82,7 +82,7 @@ public class SitemapImporter extends Thread {
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
final HarvestProcess dbocc = this.sb.urlExists(nexturlhash);
final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);

@ -48,7 +48,6 @@ import javax.xml.parsers.ParserConfigurationException;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag;
@ -134,9 +133,9 @@ public class BookmarkHelper {
int importCount = 0;
Map<MultiProtocolURI, Properties> links = new HashMap<MultiProtocolURI, Properties>();
Map<DigestURI, Properties> links = new HashMap<DigestURI, Properties>();
String title;
MultiProtocolURI url;
DigestURI url;
Bookmark bm;
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
@ -148,14 +147,14 @@ public class BookmarkHelper {
writer.close();
links = scraper.getAnchors();
} catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
for (final Entry<DigestURI, Properties> link: links.entrySet()) {
url = link.getKey();
title = link.getValue().getProperty("name", "");
Log.logInfo("BOOKMARKS", "links.get(url)");
if ("".equals(title)) {//cannot be displayed
title = url.toString();
}
bm = db.new Bookmark(DigestURI.toDigestURI(url));
bm = db.new Bookmark(url);
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
bm.setTags(tags);
bm.setPublic(importPublic);

@ -50,6 +50,7 @@ import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.logging.Log;
@ -113,7 +114,7 @@ public final class Condenser {
// add the URL components to the word list
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
Map.Entry<MultiProtocolURI, String> entry;
Map.Entry<DigestURI, String> entry;
if (indexText) {
createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter:
@ -163,7 +164,7 @@ public final class Condenser {
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
Iterator<Map.Entry<DigestURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);

@ -68,7 +68,7 @@ import net.yacy.kelondro.util.FileUtils;
public class Document {
private final DigestURI source; // the source url
private final DigestURI source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
@ -78,13 +78,14 @@ public class Document {
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, Properties> anchors; // all links embedded as clickable entities (anchor tags)
private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
private final Map<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
private final Map<DigestURI, Properties> anchors; // all links embedded as clickable entities (anchor tags)
private final Map<DigestURI, String> rss; // all embedded rss feeds
private final Map<DigestURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks, inboundlinks, outboundlinks;
private Map<DigestURI, String> audiolinks, videolinks, applinks, hyperlinks;
private Map<DigestURI, String> inboundlinks, outboundlinks;
private Map<String, String> emaillinks;
private MultiProtocolURI favicon;
private boolean resorted;
@ -103,9 +104,9 @@ public class Document {
final String[] sections, final String abstrct,
final double lon, final double lat,
final Object text,
final Map<MultiProtocolURI, Properties> anchors,
final Map<MultiProtocolURI, String> rss,
final Map<MultiProtocolURI, ImageEntry> images,
final Map<DigestURI, Properties> anchors,
final Map<DigestURI, String> rss,
final Map<DigestURI, ImageEntry> images,
final boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
@ -120,9 +121,9 @@ public class Document {
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.lon = lon;
this.lat = lat;
this.anchors = (anchors == null) ? new HashMap<MultiProtocolURI, Properties>(0) : anchors;
this.rss = (rss == null) ? new HashMap<MultiProtocolURI, String>(0) : rss;
this.images = (images == null) ? new HashMap<MultiProtocolURI, ImageEntry>() : images;
this.anchors = (anchors == null) ? new HashMap<DigestURI, Properties>(0) : anchors;
this.rss = (rss == null) ? new HashMap<DigestURI, String>(0) : rss;
this.images = (images == null) ? new HashMap<DigestURI, ImageEntry>() : images;
this.publisher = publisher;
this.hyperlinks = null;
this.audiolinks = null;
@ -397,13 +398,13 @@ dc_rights
return this.keywords;
}
public Map<MultiProtocolURI, Properties> getAnchors() {
public Map<DigestURI, Properties> getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url(String)/text(String) map
return this.anchors;
}
public Map<MultiProtocolURI, String> getRSS() {
public Map<DigestURI, String> getRSS() {
// returns all embedded rss feeds
// this is a url(String)/text(String) map
return this.rss;
@ -412,30 +413,30 @@ dc_rights
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map<MultiProtocolURI, String> getHyperlinks() {
public Map<DigestURI, String> getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (!this.resorted) resortLinks();
return this.hyperlinks;
}
public Map<MultiProtocolURI, String> getAudiolinks() {
public Map<DigestURI, String> getAudiolinks() {
if (!this.resorted) resortLinks();
return this.audiolinks;
}
public Map<MultiProtocolURI, String> getVideolinks() {
public Map<DigestURI, String> getVideolinks() {
if (!this.resorted) resortLinks();
return this.videolinks;
}
public Map<MultiProtocolURI, ImageEntry> getImages() {
public Map<DigestURI, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!this.resorted) resortLinks();
return this.images;
}
public Map<MultiProtocolURI, String> getApplinks() {
public Map<DigestURI, String> getApplinks() {
if (!this.resorted) resortLinks();
return this.applinks;
}
@ -459,23 +460,23 @@ dc_rights
synchronized (this) {
if (this.resorted) return;
// extract hyperlinks, medialinks and emaillinks from anchorlinks
MultiProtocolURI url;
DigestURI url;
String u;
int extpos, qpos;
String ext = null;
final String thishost = this.source.getHost();
this.inboundlinks = new HashMap<MultiProtocolURI, String>();
this.outboundlinks = new HashMap<MultiProtocolURI, String>();
this.hyperlinks = new HashMap<MultiProtocolURI, String>();
this.videolinks = new HashMap<MultiProtocolURI, String>();
this.audiolinks = new HashMap<MultiProtocolURI, String>();
this.applinks = new HashMap<MultiProtocolURI, String>();
this.inboundlinks = new HashMap<DigestURI, String>();
this.outboundlinks = new HashMap<DigestURI, String>();
this.hyperlinks = new HashMap<DigestURI, String>();
this.videolinks = new HashMap<DigestURI, String>();
this.audiolinks = new HashMap<DigestURI, String>();
this.applinks = new HashMap<DigestURI, String>();
this.emaillinks = new HashMap<String, String>();
final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (final Map.Entry<MultiProtocolURI, ImageEntry> entry: collectedImages.entrySet()) {
final Map<DigestURI, ImageEntry> collectedImages = new HashMap<DigestURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (final Map.Entry<DigestURI, ImageEntry> entry: collectedImages.entrySet()) {
if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
}
for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) {
for (final Map.Entry<DigestURI, Properties> entry: this.anchors.entrySet()) {
url = entry.getKey();
if (url == null) continue;
final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0;
@ -585,23 +586,23 @@ dc_rights
return v;
}
private static Map<MultiProtocolURI, String> allReflinks(final Collection<?> links) {
private static Map<DigestURI, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Map<DigestURI, String> v = new HashMap<DigestURI, String>();
final Iterator<?> i = links.iterator();
Object o;
MultiProtocolURI url = null;
DigestURI url = null;
String u;
int pos;
loop: while (i.hasNext())
try {
o = i.next();
if (o instanceof MultiProtocolURI)
url = (MultiProtocolURI) o;
if (o instanceof DigestURI)
url = (DigestURI) o;
else if (o instanceof String)
url = new MultiProtocolURI((String) o);
url = new DigestURI((String) o);
else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url();
else {
@ -615,7 +616,7 @@ dc_rights
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
u = u.substring(pos);
url = new MultiProtocolURI(u);
url = new DigestURI(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
@ -625,7 +626,7 @@ dc_rights
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
u = "http:/" + u.substring(pos);
url = new MultiProtocolURI(u);
url = new DigestURI(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
@ -689,12 +690,12 @@ dc_rights
return c;
}
public Set<MultiProtocolURI> inboundLinks() {
public Set<DigestURI> inboundLinks() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();
}
public Set<MultiProtocolURI> outboundLinks() {
public Set<DigestURI> outboundLinks() {
if (this.outboundlinks == null) resortLinks();
return (this.outboundlinks == null) ? null : this.outboundlinks.keySet();
}
@ -764,9 +765,7 @@ dc_rights
* @param docs
* @return
*/
public static Document mergeDocuments(final DigestURI location,
final String globalMime, final Document[] docs)
{
public static Document mergeDocuments(final DigestURI location, final String globalMime, final Document[] docs) {
if (docs == null || docs.length == 0) return null;
if (docs.length == 1) return docs[0];
@ -778,9 +777,9 @@ dc_rights
final StringBuilder description = new StringBuilder(80);
final Collection<String> titles = new LinkedHashSet<String>();
final Collection<String> sectionTitles = new LinkedHashSet<String>();
final Map<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
final Map<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
final Map<DigestURI, String> rss = new HashMap<DigestURI, String>();
final Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
double lon = 0.0d, lat = 0.0d;
for (final Document doc: docs) {
@ -856,22 +855,22 @@ dc_rights
false);
}
public static Map<MultiProtocolURI, String> getHyperlinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
public static Map<DigestURI, String> getHyperlinks(final Document[] documents) {
final Map<DigestURI, String> result = new HashMap<DigestURI, String>();
for (final Document d: documents) {
result.putAll(d.getHyperlinks());
final Object parser = d.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0)try {result.put(new MultiProtocolURI(refresh), "refresh");} catch (MalformedURLException e) {}
if (refresh != null && refresh.length() > 0)try {result.put(new DigestURI(refresh), "refresh");} catch (MalformedURLException e) {}
}
}
return result;
}
public static Map<MultiProtocolURI, String> getImagelinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
public static Map<DigestURI, String> getImagelinks(final Document[] documents) {
final Map<DigestURI, String> result = new HashMap<DigestURI, String>();
for (final Document d: documents) {
for (final ImageEntry imageReference : d.getImages().values()) {
// construct an image name which contains the document title to enhance the search process for images
@ -881,30 +880,30 @@ dc_rights
return result;
}
public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
public static Map<DigestURI, String> getAudiolinks(final Document[] documents) {
final Map<DigestURI, String> result = new HashMap<DigestURI, String>();
for (final Document d: documents) {
for (Map.Entry<MultiProtocolURI, String> e: d.audiolinks.entrySet()) {
for (Map.Entry<DigestURI, String> e: d.audiolinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
return result;
}
public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
public static Map<DigestURI, String> getVideolinks(final Document[] documents) {
final Map<DigestURI, String> result = new HashMap<DigestURI, String>();
for (final Document d: documents) {
for (Map.Entry<MultiProtocolURI, String> e: d.videolinks.entrySet()) {
for (Map.Entry<DigestURI, String> e: d.videolinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
return result;
}
public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
public static Map<DigestURI, String> getApplinks(final Document[] documents) {
final Map<DigestURI, String> result = new HashMap<DigestURI, String>();
for (final Document d: documents) {
for (Map.Entry<MultiProtocolURI, String> e: d.applinks.entrySet()) {
for (Map.Entry<DigestURI, String> e: d.applinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}

@ -53,6 +53,7 @@ import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -121,11 +122,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// class variables: collectors for links
private final Map<MultiProtocolURI, Properties> anchors;
private final Map<MultiProtocolURI, String> rss, css;
private final Set<MultiProtocolURI> script, frames, iframes;
private final Map<MultiProtocolURI, EmbedEntry> embeds; // urlhash/embed relation
private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<DigestURI, Properties> anchors;
private final Map<DigestURI, String> rss, css;
private final Set<DigestURI> script, frames, iframes;
private final Map<DigestURI, EmbedEntry> embeds; // urlhash/embed relation
private final Map<DigestURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private LinkedHashSet<String> titles;
//private String headline;
@ -135,7 +136,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
private MultiProtocolURI canonical;
private DigestURI canonical;
private final int maxLinks;
private int breadcrumbs;
@ -148,7 +149,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/**
* The document root {@link DigestURI}
*/
private MultiProtocolURI root;
private DigestURI root;
/**
* evaluation scores: count appearance of specific attributes
@ -156,7 +157,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Evaluation evaluationScores;
@SuppressWarnings("unchecked")
public ContentScraper(final MultiProtocolURI root, int maxLinks) {
public ContentScraper(final DigestURI root, int maxLinks) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
@ -164,15 +165,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.root = root;
this.maxLinks = maxLinks;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<MultiProtocolURI, String>(maxLinks);
this.css = new SizeLimitedMap<MultiProtocolURI, String>(maxLinks);
this.anchors = new SizeLimitedMap<MultiProtocolURI, Properties>(maxLinks);
this.images = new SizeLimitedMap<MultiProtocolURI, ImageEntry>(maxLinks);
this.embeds = new SizeLimitedMap<MultiProtocolURI, EmbedEntry>(maxLinks);
this.frames = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.iframes = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.rss = new SizeLimitedMap<DigestURI, String>(maxLinks);
this.css = new SizeLimitedMap<DigestURI, String>(maxLinks);
this.anchors = new SizeLimitedMap<DigestURI, Properties>(maxLinks);
this.images = new SizeLimitedMap<DigestURI, ImageEntry>(maxLinks);
this.embeds = new SizeLimitedMap<DigestURI, EmbedEntry>(maxLinks);
this.frames = new SizeLimitedSet<DigestURI>(maxLinks);
this.iframes = new SizeLimitedSet<DigestURI>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks);
this.script = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.script = new SizeLimitedSet<DigestURI>(maxLinks);
this.titles = new LinkedHashSet<String>();
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
@ -194,7 +195,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.content.trimToSize();
}
private void mergeAnchors(final MultiProtocolURI url, final Properties p) {
private void mergeAnchors(final DigestURI url, final Properties p) {
final Properties p0 = this.anchors.get(url);
if (p0 == null) {
this.anchors.put(url, p);
@ -282,7 +283,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// find http links inside text
s = 0;
String u;
MultiProtocolURI url;
DigestURI url;
while (s < b.length()) {
p = find(b, dpssp, s);
if (p == Integer.MAX_VALUE) break;
@ -294,7 +295,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 6;
try {
url = new MultiProtocolURI(u);
url = new DigestURI(u);
mergeAnchors(url, new Properties());
continue;
} catch (final MalformedURLException e) {}
@ -317,9 +318,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return (p < 0) ? Integer.MAX_VALUE : p;
}
private MultiProtocolURI absolutePath(final String relativePath) {
private DigestURI absolutePath(final String relativePath) {
try {
return MultiProtocolURI.newURL(this.root, relativePath);
return DigestURI.newURL(this.root, relativePath);
} catch (final Exception e) {
return null;
}
@ -331,7 +332,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
final DigestURI url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
@ -343,10 +344,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) {
try {
this.root = new MultiProtocolURI(tagopts.getProperty("href", EMPTY_STRING));
this.root = new DigestURI(tagopts.getProperty("href", EMPTY_STRING));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
mergeAnchors(src, tagopts /* with property "name" */);
this.frames.add(src);
@ -384,13 +385,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String href = tagopts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
tagopts.put("nme", areatitle);
MultiProtocolURI url = absolutePath(href);
DigestURI url = absolutePath(href);
tagopts.put("href", url.toNormalform(true));
mergeAnchors(url, tagopts);
}
} else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", EMPTY_STRING);
final MultiProtocolURI newLink = absolutePath(href);
final DigestURI newLink = absolutePath(href);
if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true));
@ -420,7 +421,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
final DigestURI url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
@ -434,12 +435,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
MultiProtocolURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
DigestURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
tagopts.put("value", url.toNormalform(true));
mergeAnchors(url, tagopts /* with property "name" */);
}
} else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
final DigestURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
mergeAnchors(src, tagopts /* with property "name" */);
this.iframes.add(src);
@ -459,7 +460,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
final String href = tagopts.getProperty("href", EMPTY_STRING);
MultiProtocolURI url;
DigestURI url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String f = url.getFileName();
final int p = f.lastIndexOf('.');
@ -552,7 +553,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (IOException e) {
}
}
for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
for (final Map.Entry<DigestURI, Properties> entry: scraper.getAnchors().entrySet()) {
mergeAnchors(entry.getKey(), entry.getValue());
}
this.images.putAll(scraper.images);
@ -640,15 +641,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.li.toArray(new String[this.li.size()]);
}
public MultiProtocolURI[] getFlash() {
public DigestURI[] getFlash() {
String ext;
ArrayList<MultiProtocolURI> f = new ArrayList<MultiProtocolURI>();
for (final MultiProtocolURI url: this.anchors.keySet()) {
ArrayList<DigestURI> f = new ArrayList<DigestURI>();
for (final DigestURI url: this.anchors.keySet()) {
ext = url.getFileExtension();
if (ext == null) continue;
if (ext.equals("swf")) f.add(url);
}
return f.toArray(new MultiProtocolURI[f.size()]);
return f.toArray(new DigestURI[f.size()]);
}
public boolean containsFlash() {
@ -674,36 +675,36 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public Map<MultiProtocolURI, Properties> getAnchors() {
public Map<DigestURI, Properties> getAnchors() {
// returns a url (DigestURI) / tag properties (Properties) relation
return this.anchors;
}
public Map<MultiProtocolURI, String> getRSS() {
public Map<DigestURI, String> getRSS() {
// returns a url (DigestURI) / name (String) relation
return this.rss;
}
public Map<MultiProtocolURI, String> getCSS() {
public Map<DigestURI, String> getCSS() {
// returns a url (DigestURI) / name (String) relation
return this.css;
}
public Set<MultiProtocolURI> getFrames() {
public Set<DigestURI> getFrames() {
// returns the set of frame source urls
return this.frames;
}
public Set<MultiProtocolURI> getIFrames() {
public Set<DigestURI> getIFrames() {
// returns the set of iframe source urls
return this.iframes;
}
public Set<MultiProtocolURI> getScript() {
public Set<DigestURI> getScript() {
return this.script;
}
public MultiProtocolURI getCanonical() {
public DigestURI getCanonical() {
return this.canonical;
}
@ -711,11 +712,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* get all images
* @return a map of &lt;DigestURI, ImageEntry&gt;
*/
public Map<MultiProtocolURI, ImageEntry> getImages() {
public Map<DigestURI, ImageEntry> getImages() {
return this.images;
}
public Map<MultiProtocolURI, EmbedEntry> getEmbeds() {
public Map<DigestURI, EmbedEntry> getEmbeds() {
return this.embeds;
}
@ -970,29 +971,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false, maxLinks);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURI("http://localhost"),null,false, maxLinks);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"), maxLinks);
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost"), maxLinks);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();
return scraper;
}
public static void addAllImages(final Map<MultiProtocolURI, ImageEntry> a, final Map<MultiProtocolURI, ImageEntry> b) {
final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<MultiProtocolURI, ImageEntry> ie;
public static void addAllImages(final Map<DigestURI, ImageEntry> a, final Map<DigestURI, ImageEntry> b) {
final Iterator<Map.Entry<DigestURI, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<DigestURI, ImageEntry> ie;
while (i.hasNext()) {
ie = i.next();
addImage(a, ie.getValue());
}
}
public static void addImage(final Map<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
public static void addImage(final Map<DigestURI, ImageEntry> a, final ImageEntry ie) {
if (a.containsKey(ie.url())) {
// in case of a collision, take the image that has the better image size tags
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie);

@ -20,15 +20,15 @@
package net.yacy.document.parser.html;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.data.meta.DigestURI;
public class EmbedEntry {
private final MultiProtocolURI url;
private final DigestURI url;
private final int width, height;
private final String type, pluginspage;
public EmbedEntry(final MultiProtocolURI url, int width, int height, String type, String pluginspage) {
public EmbedEntry(final DigestURI url, int width, int height, String type, String pluginspage) {
this.url = url;
this.width = width;
this.height = height;
@ -36,7 +36,7 @@ public class EmbedEntry {
this.pluginspage = pluginspage;
}
public MultiProtocolURI getUrl() {
public DigestURI getUrl() {
return this.url;
}

@ -26,16 +26,16 @@ package net.yacy.document.parser.html;
import java.util.Comparator;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.data.meta.DigestURI;
public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
private final MultiProtocolURI url;
private final DigestURI url;
private final String alt;
private final int width, height;
private final long fileSize;
public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) {
public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) {
assert url != null;
this.url = url;
this.alt = alt;
@ -44,7 +44,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
this.fileSize = fileSize;
}
public MultiProtocolURI url() {
public DigestURI url() {
return this.url;
}

@ -35,7 +35,7 @@ import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.data.meta.DigestURI;
public class ScraperInputStream extends InputStream implements ScraperListener {
@ -58,7 +58,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final MultiProtocolURI rooturl,
final DigestURI rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
final int maxLinks

@ -34,7 +34,6 @@ import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -149,7 +148,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
public static ContentScraper parseToScraper(
final MultiProtocolURI location,
final DigestURI location,
final String documentCharset,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {

@ -184,8 +184,8 @@ public class genericImageParser extends AbstractParser implements Parser {
}
final HashSet<String> languages = new HashSet<String>();
final HashMap<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
final HashMap<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
final HashMap<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
// add this image to the map of images
final String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
@ -223,7 +223,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
public static ImageInfo parseJavaImage(
final MultiProtocolURI location,
final DigestURI location,
final InputStream sourceStream) throws Parser.Failure {
BufferedImage image = null;
try {
@ -238,7 +238,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
public static ImageInfo parseJavaImage(
final MultiProtocolURI location,
final DigestURI location,
final BufferedImage image) {
final ImageInfo ii = new ImageInfo(location);
ii.image = image;
@ -275,12 +275,12 @@ public class genericImageParser extends AbstractParser implements Parser {
}
public static class ImageInfo {
public MultiProtocolURI location;
public DigestURI location;
public BufferedImage image;
public StringBuilder info;
public int height;
public int width;
public ImageInfo(final MultiProtocolURI location) {
public ImageInfo(final DigestURI location) {
this.location = location;
this.image = null;
this.info = new StringBuilder();

@ -37,7 +37,6 @@ import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.document.AbstractParser;
@ -75,13 +74,13 @@ public class rssParser extends AbstractParser implements Parser {
final List<Document> docs = new ArrayList<Document>();
DigestURI uri;
Set<String> languages;
Map<MultiProtocolURI, Properties> anchors;
Map<DigestURI, Properties> anchors;
Document doc;
for (final Hit item: feed) try {
uri = new DigestURI(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
anchors = new HashMap<MultiProtocolURI, Properties>();
anchors = new HashMap<DigestURI, Properties>();
Properties p = new Properties();
p.put("name", item.getTitle());
anchors.put(uri, p);
@ -102,7 +101,7 @@ public class rssParser extends AbstractParser implements Parser {
null,
anchors,
null,
new HashMap<MultiProtocolURI, ImageEntry>(),
new HashMap<DigestURI, ImageEntry>(),
false);
docs.add(doc);
} catch (MalformedURLException e) {

@ -33,7 +33,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -168,7 +167,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
Document[] theDocs;
// workaround for relative links in the file; normally '#' should be used behind the location, see
// below for the reversion of this effect
final DigestURI url = DigestURI.toDigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath));
final DigestURI url = DigestURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());

@ -40,7 +40,6 @@ import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
@ -96,7 +95,7 @@ public class sitemapParser extends AbstractParser implements Parser {
null,
null,
null,
new HashMap<MultiProtocolURI, ImageEntry>(),
new HashMap<DigestURI, ImageEntry>(),
false);
docs.add(doc);
} catch (MalformedURLException e) {

@ -33,7 +33,6 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -79,7 +78,7 @@ public class swfParser extends AbstractParser implements Parser {
final String[] sections = null;
final String abstrct = null;
//TreeSet images = null;
final Map<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final Map<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;
@ -98,7 +97,7 @@ public class swfParser extends AbstractParser implements Parser {
urlnr = Integer.toString(++urls).toString();
final Properties p = new Properties();
p.put("name", urlnr);
anchors.put(new MultiProtocolURI(url), p);
anchors.put(new DigestURI(url), p);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}

@ -33,7 +33,6 @@ import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -90,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(DigestURI.toDigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp);
subDocs = TextParser.parseSource(DigestURI.newURL(url, "#" + name), mime, null, tmp);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -37,7 +37,6 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.Base64Order;
import net.yacy.document.AbstractParser;
@ -70,7 +69,7 @@ public class vcfParser extends AbstractParser implements Parser {
final StringBuilder parsedTitle = new StringBuilder();
final StringBuilder parsedDataText = new StringBuilder();
final HashMap<String, String> parsedData = new HashMap<String, String>();
final HashMap<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final HashMap<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
final LinkedList<String> parsedNames = new LinkedList<String>();
boolean useLastLine = false;
@ -177,7 +176,7 @@ public class vcfParser extends AbstractParser implements Parser {
parsedData.clear();
} else if (key.toUpperCase().startsWith("URL")) {
try {
final MultiProtocolURI newURL = new MultiProtocolURI(value);
final DigestURI newURL = new DigestURI(value);
final Properties p = new Properties();
p.put("name", newURL.toString());
anchors.put(newURL, p);

@ -32,7 +32,6 @@ import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -87,7 +86,7 @@ public class zipParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURI virtualURL = DigestURI.toDigestURI(MultiProtocolURI.newURL(url, "#" + name));
final DigestURI virtualURL = DigestURI.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
if (docs == null) continue;

@ -138,15 +138,12 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
* DigestURI from general URI
* @param u
*/
/*
private DigestURI(final MultiProtocolURI u) {
super(u);
this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null;
}
public static DigestURI toDigestURI(MultiProtocolURI u) {
return (u instanceof DigestURI) ? ((DigestURI) u) : new DigestURI(u);
}
*/
/**
* DigestURI from general URI, hash already calculated
@ -168,6 +165,23 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
this.hash = null;
}
public static DigestURI newURL(final DigestURI baseURL, String relPath) throws MalformedURLException {
if (relPath.startsWith("//")) {
// patch for urls starting with "//" which can be found in the wild
relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath;
}
if ((baseURL == null) ||
isHTTP(relPath) ||
isHTTPS(relPath) ||
isFTP(relPath) ||
isFile(relPath) ||
isSMB(relPath)/*||
relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) {
return new DigestURI(relPath);
}
return new DigestURI(baseURL, relPath);
}
private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful
@Override

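A minimal usage sketch of the new DigestURI.newURL helper (hypothetical inputs; assumes the YaCy classpath):

    import java.net.MalformedURLException;
    import net.yacy.kelondro.data.meta.DigestURI;

    static void resolveExamples() throws MalformedURLException {
        final DigestURI base = new DigestURI("http://example.org/dir/page.html");
        // absolute urls with a known protocol (http, https, ftp, file, smb) ignore the base url
        final DigestURI a = DigestURI.newURL(base, "https://other.net/x.html");
        // protocol-relative links ("//host/path") inherit the protocol of the base url
        final DigestURI b = DigestURI.newURL(base, "//cdn.example.org/a.js");
        // everything else is resolved relative to the base url
        final DigestURI c = DigestURI.newURL(base, "img/logo.png");
    }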
@ -169,7 +169,7 @@ public class Transmission {
notFoundx.add(e.urlhash());
continue;
}
if (!Transmission.this.segment.fulltext().exists(e.urlhash())) {
if (!Transmission.this.segment.fulltext().exists(ASCII.String(e.urlhash()))) {
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
} else {

@ -46,7 +46,6 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.sorting.ClusteredScoreMap;
@ -83,9 +82,9 @@ public class WebStructureGraph {
private static class LearnObject {
private final DigestURI url;
private final Set<MultiProtocolURI> globalRefURLs;
private final Set<DigestURI> globalRefURLs;
private LearnObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
private LearnObject(final DigestURI url, final Set<DigestURI> globalRefURLs) {
this.url = url;
this.globalRefURLs = globalRefURLs;
}
@ -160,11 +159,11 @@ public class WebStructureGraph {
public void generateCitationReference(final DigestURI url, final Document document) {
// generate citation reference
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>();
final Map<DigestURI, String> hl = document.getHyperlinks();
final Iterator<DigestURI> it = hl.keySet().iterator();
final HashSet<DigestURI> globalRefURLs = new HashSet<DigestURI>();
final String refhost = url.getHost();
MultiProtocolURI u;
DigestURI u;
int maxref = 1000;
while ( it.hasNext() && maxref-- > 0 ) {
u = it.next();
@ -191,7 +190,7 @@ public class WebStructureGraph {
}
public void generateCitationReference(final DigestURI from, final DigestURI to) {
final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>();
final HashSet<DigestURI> globalRefURLs = new HashSet<DigestURI>();
final String refhost = from.getHost();
if (refhost != null && to.getHost() != null && !to.getHost().equals(refhost)) globalRefURLs.add(to);
final LearnObject lro = new LearnObject(from, globalRefURLs);
@ -586,12 +585,10 @@ public class WebStructureGraph {
private void learnrefs(final LearnObject lro) {
final Set<String> refhosts = new HashSet<String>();
DigestURI du;
String hosthash;
for ( final MultiProtocolURI u : lro.globalRefURLs ) {
for ( final DigestURI u : lro.globalRefURLs ) {
if (Switchboard.getSwitchboard().shallTerminate()) break;
du = DigestURI.toDigestURI(u);
hosthash = ASCII.String(du.hash(), 6, 6);
hosthash = ASCII.String(u.hash(), 6, 6);
if (!exists(hosthash)) {
// this must be recorded as an host with no references
synchronized ( this.structure_new ) {

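The learnrefs change above reads the host hash straight from each DigestURI; a one-method sketch of that step (assumption: the 12-character url hash keeps its host-related part in the last 6 characters):

    import net.yacy.cora.document.ASCII;
    import net.yacy.kelondro.data.meta.DigestURI;

    static String hosthash(final DigestURI u) {
        // characters 6..11 of the 12-character url hash identify the host
        return ASCII.String(u.hash(), 6, 6);
    }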
@ -245,10 +245,10 @@ public final class yacyRelease extends yacyVersion {
}
// analyze the links in the scraper resource and find the link to the latest release in it
final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final Map<DigestURI, Properties> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();
for (final MultiProtocolURI url : anchors.keySet()) {
for (final DigestURI url : anchors.keySet()) {
try {
final yacyRelease release = new yacyRelease(url, location.getPublicKey());
//System.out.println("r " + release.toAnchor());

@ -39,7 +39,6 @@ import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
@ -381,7 +380,7 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();

@ -1545,12 +1545,12 @@ public final class Switchboard extends serverSwitch {
return false;
}
public HarvestProcess urlExists(final byte[] hash) {
public HarvestProcess urlExists(final String hash) {
// tests if the hash occurs in any database;
// if it exists, the name of the database is returned,
// if it does not exist, null is returned
if (this.index.exists(hash)) return HarvestProcess.LOADED;
return this.crawlQueues.urlExists(hash);
return this.crawlQueues.urlExists(ASCII.getBytes(hash));
}
public void urlRemove(final Segment segment, final byte[] hash) {
@ -2494,7 +2494,7 @@ public final class Switchboard extends serverSwitch {
)
) {
// get the hyperlinks
final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents);
final Map<DigestURI, String> hl = Document.getHyperlinks(documents);
// also add all media links to the crawl stack; they will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
@ -2506,7 +2506,7 @@ public final class Switchboard extends serverSwitch {
// insert those hyperlinks to the crawler
MultiProtocolURI nextUrl;
for ( final Map.Entry<MultiProtocolURI, String> nextEntry : hl.entrySet() ) {
for ( final Map.Entry<DigestURI, String> nextEntry : hl.entrySet() ) {
// check for interruption
checkInterruption();
@ -2654,7 +2654,7 @@ public final class Switchboard extends serverSwitch {
// CREATE INDEX
final String dc_title = document.dc_title();
final DigestURI url = DigestURI.toDigestURI(document.dc_source());
final DigestURI url = document.dc_source();
final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
@ -2711,14 +2711,14 @@ public final class Switchboard extends serverSwitch {
feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url(), ASCII.String(queueEntry.url().hash())));
// store the rss feeds found in the document into the rss table
for ( final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet() ) {
for ( final Map.Entry<DigestURI, String> rssEntry : document.getRSS().entrySet() ) {
final Tables.Data rssRow = new Tables.Data();
rssRow.put("referrer", url.hash());
rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true)));
rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
rssRow.put("recording_date", new Date());
try {
this.tables.update("rss", DigestURI.toDigestURI(rssEntry.getKey()).hash(), rssRow);
this.tables.update("rss", rssEntry.getKey().hash(), rssRow);
} catch ( final IOException e ) {
Log.logException(e);
}
@ -2760,7 +2760,7 @@ public final class Switchboard extends serverSwitch {
public final void addAllToIndex(
final DigestURI url,
final Map<MultiProtocolURI, String> links,
final Map<DigestURI, String> links,
final SearchEvent searchEvent,
final String heuristicName) {
@ -2775,10 +2775,10 @@ public final class Switchboard extends serverSwitch {
}
// check if some of the links match the query
final Map<MultiProtocolURI, String> matcher = searchEvent.query.separateMatches(links);
final Map<DigestURI, String> matcher = searchEvent.query.separateMatches(links);
// take the matcher and load them all
for ( final Map.Entry<MultiProtocolURI, String> entry : matcher.entrySet() ) {
for ( final Map.Entry<DigestURI, String> entry : matcher.entrySet() ) {
try {
addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
} catch ( final IOException e ) {
@ -2787,7 +2787,7 @@ public final class Switchboard extends serverSwitch {
}
// then take the links that did not match and load them as well
for ( final Map.Entry<MultiProtocolURI, String> entry : links.entrySet() ) {
for ( final Map.Entry<DigestURI, String> entry : links.entrySet() ) {
try {
addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
} catch ( final IOException e ) {
@ -2926,10 +2926,10 @@ public final class Switchboard extends serverSwitch {
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName)
throws IOException,
Parser.Failure {
if ( searchEvent != null ) {
if (searchEvent != null) {
searchEvent.addHeuristic(url.hash(), heuristicName, true);
}
if ( this.index.exists(url.hash()) ) {
if (this.index.exists(ASCII.String(url.hash()))) {
return; // don't do double-work
}
final Request request = this.loader.request(url, true, true);
@ -3004,7 +3004,7 @@ public final class Switchboard extends serverSwitch {
*/
public void addToCrawler(final DigestURI url, final boolean asglobal) {
if ( this.index.exists(url.hash()) ) {
if (this.index.exists(ASCII.String(url.hash()))) {
return; // don't do double-work
}
final Request request = this.loader.request(url, true, true);
@ -3204,7 +3204,7 @@ public final class Switchboard extends serverSwitch {
return "no DHT distribution: not enabled (per setting)";
}
final Segment indexSegment = this.index;
int size = indexSegment.fulltext().size();
long size = indexSegment.fulltext().collectionSize();
if ( size < 10 ) {
return "no DHT distribution: loadedURL.size() = " + size;
}
@ -3348,12 +3348,12 @@ public final class Switchboard extends serverSwitch {
return;
}
final Map<MultiProtocolURI, String> links;
final Map<DigestURI, String> links;
searchEvent.rankingProcess.oneFeederStarted();
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
if ( links != null ) {
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final Iterator<DigestURI> i = links.keySet().iterator();
while ( i.hasNext() ) {
if ( !i.next().getHost().endsWith(host) ) {
i.remove();
@ -3387,16 +3387,16 @@ public final class Switchboard extends serverSwitch {
return;
}
final Map<MultiProtocolURI, String> links;
final Map<DigestURI, String> links;
DigestURI url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final Iterator<DigestURI> i = links.keySet().iterator();
final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
while (i.hasNext()) {
url = DigestURI.toDigestURI(i.next());
url = i.next();
boolean islocal = url.getHost().contentEquals(startUrl.getHost());
// add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
@ -3458,11 +3458,11 @@ public final class Switchboard extends serverSwitch {
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
if ( rss != null ) {
final Map<MultiProtocolURI, String> links = new TreeMap<MultiProtocolURI, String>();
MultiProtocolURI uri;
final Map<DigestURI, String> links = new TreeMap<DigestURI, String>();
DigestURI uri;
for ( final RSSMessage message : rss.getFeed() ) {
try {
uri = new MultiProtocolURI(message.getLink());
uri = new DigestURI(message.getLink());
links.put(uri, message.getTitle());
} catch ( final MalformedURLException e ) {
}

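Several call sites above switch from byte[] to String url hashes; a minimal sketch of the new convention (hypothetical helper; assumes a running Switchboard), where hashes are handed to the index in their ASCII string form, matching the Solr id field:

    import net.yacy.cora.document.ASCII;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.search.Switchboard;

    static boolean alreadyIndexed(final Switchboard sb, final DigestURI url) {
        // Segment.exists()/Fulltext.exists() now take the String form of the url hash
        return sb.index.exists(ASCII.String(url.hash()));
    }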
@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
@ -195,9 +196,12 @@ public final class Fulltext {
this.solrInstances.disconnect1();
}
public EmbeddedSolrConnector getDefaultLocalSolrConnector() {
if (this.solrInstances.getSolr0() == null) return null;
return new EmbeddedSolrConnector(this.solrInstances.getSolr0());
public EmbeddedSolrConnector getDefaultEmbeddedConnector() {
return this.solrInstances.getDefaultEmbeddedConnector();
}
public EmbeddedSolrConnector getEmbeddedConnector(String corename) {
return this.solrInstances.getEmbeddedConnector(corename);
}
public RemoteSolrConnector getDefaultRemoteSolrConnector() {
@ -210,11 +214,11 @@ public final class Fulltext {
}
public SolrConnector getDefaultConnector() {
return this.solrInstances.getDefaultConnector();
return this.solrInstances.getDefaultMirrorConnector();
}
public SolrConnector getWebgraphConnector() {
return this.solrInstances.getConnector(WebgraphSchema.CORE_NAME);
return this.solrInstances.getMirrorConnector(WebgraphSchema.CORE_NAME);
}
public void clearCache() {
@ -232,7 +236,7 @@ public final class Fulltext {
this.urlIndexFile.clear();
}
this.statsDump = null;
this.solrInstances.getDefaultConnector().commit(true);
this.commit(true);
}
public void clearLocalSolr() throws IOException {
@ -240,6 +244,7 @@ public final class Fulltext {
if (instance != null) {
for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear();
}
this.commit(false);
this.solrInstances.clearCache();
}
@ -255,11 +260,19 @@ public final class Fulltext {
* get the size of the default index
* @return the number of entries in the default index
*/
public int size() {
int size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
size += this.solrInstances.getDefaultConnector().getSize();
public long collectionSize() {
long size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
size += this.getDefaultConnector().getSize();
return size;
}
/**
* get the size of the webgraph index
* @return the number of edge documents in the webgraph index
*/
public long webgraphSize() {
return this.getWebgraphConnector().getSize();
}
public void close() {
this.statsDump = null;
@ -279,7 +292,7 @@ public final class Fulltext {
if (urlHash == null) return null;
Date x;
try {
x = (Date) this.solrInstances.getDefaultConnector().getFieldById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName());
x = (Date) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName());
} catch (IOException e) {
return null;
}
@ -290,7 +303,7 @@ public final class Fulltext {
if (urlHash == null) return null;
String x;
try {
x = (String) this.solrInstances.getDefaultConnector().getFieldById(ASCII.String(urlHash), CollectionSchema.sku.getSolrFieldName());
x = (String) this.getDefaultConnector().getFieldById(ASCII.String(urlHash), CollectionSchema.sku.getSolrFieldName());
} catch (IOException e) {
return null;
}
@ -317,7 +330,7 @@ public final class Fulltext {
// get the metadata from Solr
try {
SolrDocument doc = this.solrInstances.getDefaultConnector().getById(ASCII.String(urlHash));
SolrDocument doc = this.getDefaultConnector().getById(ASCII.String(urlHash));
if (doc != null) {
if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash);
return new URIMetadataNode(doc, wre, weight);
@ -346,17 +359,27 @@ public final class Fulltext {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
byte[] idb = ASCII.getBytes(id);
try {
if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
Date sdDate = (Date) this.solrInstances.getDefaultConnector().getFieldById(id, CollectionSchema.last_modified.getSolrFieldName());
Date docDate = null;
if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) {
if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
Date sdDate = (Date) this.getDefaultConnector().getFieldById(id, CollectionSchema.last_modified.getSolrFieldName());
Date docDate = null;
if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) {
if (this.collectionConfiguration.contains(CollectionSchema.ip_s)) {
// ip_s needs a dns lookup, which causes blocking during search here
this.solrInstances.getDefaultConnector().add(doc);
this.getDefaultConnector().add(doc);
} else synchronized (this.solrInstances) {
this.solrInstances.getDefaultConnector().add(doc);
this.getDefaultConnector().add(doc);
}
}
}
} catch (SolrException e) {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache();
}
public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
try {
this.getWebgraphConnector().add(edges);
} catch (SolrException e) {
throw new IOException(e.getMessage(), e);
}
@ -371,13 +394,13 @@ public final class Fulltext {
String id = ASCII.String(idb);
try {
if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
SolrDocument sd = this.solrInstances.getDefaultConnector().getById(id);
SolrDocument sd = this.getDefaultConnector().getById(id);
if (sd == null || (new URIMetadataNode(sd)).isOlder(row)) {
if (this.collectionConfiguration.contains(CollectionSchema.ip_s)) {
// ip_s needs a dns lookup, which causes blocking during search here
this.solrInstances.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row));
this.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row));
} else synchronized (this.solrInstances) {
this.solrInstances.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row));
this.getDefaultConnector().add(getDefaultConfiguration().metadata2solr(row));
}
}
} catch (SolrException e) {
@ -397,15 +420,22 @@ public final class Fulltext {
public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final String q = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final String collection1Query = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ?
(" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
""
);
final String webgraphQuery = WebgraphSchema.source_host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ?
(" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
""
);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solrInstances) {
try {
Fulltext.this.solrInstances.getDefaultConnector().deleteByQuery(q);
} catch (IOException e) {}
try {Fulltext.this.getDefaultConnector().deleteByQuery(collection1Query);} catch (IOException e) {}
try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (IOException e) {}
}
// delete in old metadata structure
@ -443,21 +473,30 @@ public final class Fulltext {
};
if (concurrent) t.start(); else {
t.run();
Fulltext.this.getDefaultConnector().commit(true);
Fulltext.this.commit(true);
}
}
public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
final String q = CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final String collectionQuery =
CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ?
(" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
""
);
final String webgraphQuery =
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ?
(" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
""
);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solrInstances) {
try {
Fulltext.this.getDefaultConnector().deleteByQuery(q);
} catch (IOException e) {}
try {Fulltext.this.getDefaultConnector().deleteByQuery(collectionQuery);} catch (IOException e) {}
try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (IOException e) {}
}
// finally remove the line with statistics
if (Fulltext.this.statsDump != null) {
@ -475,7 +514,7 @@ public final class Fulltext {
};
if (concurrent) t.start(); else {
t.run();
Fulltext.this.getDefaultConnector().commit(true);
Fulltext.this.commit(true);
}
}
@ -489,12 +528,12 @@ public final class Fulltext {
DigestURI uri;
try {uri = new DigestURI(basepath);} catch (MalformedURLException e) {return 0;}
final String host = uri.getHost();
final String q = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread(){
public void run() {
final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentQuery(q, 0, 1000000, 600000, -1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentQuery(collectionQuery, 0, 1000000, 600000, -1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
try {
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
@ -504,7 +543,7 @@ public final class Fulltext {
count.incrementAndGet();
}
}
if (count.get() > 0) Fulltext.this.getDefaultConnector().commit(true);
if (count.get() > 0) Fulltext.this.commit(true);
} catch (InterruptedException e) {}
}
};
@ -525,8 +564,9 @@ public final class Fulltext {
synchronized (Fulltext.this.solrInstances) {
for (byte[] urlHash: deleteIDs) {
Fulltext.this.getDefaultConnector().delete(ASCII.String(urlHash));
Fulltext.this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":" + ASCII.String(urlHash));
}
Fulltext.this.getDefaultConnector().commit(true);
Fulltext.this.commit(true);
}
} catch (final Throwable e) {
Log.logException(e);
@ -546,6 +586,7 @@ public final class Fulltext {
try {
synchronized (this.solrInstances) {
this.getDefaultConnector().delete(ASCII.String(urlHash));
this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":" + ASCII.String(urlHash));
}
} catch (final Throwable e) {
Log.logException(e);
@ -560,11 +601,11 @@ public final class Fulltext {
return false;
}
public boolean exists(final byte[] urlHash) {
public boolean exists(final String urlHash) {
if (urlHash == null) return false;
if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true;
if (this.urlIndexFile != null && this.urlIndexFile.has(ASCII.getBytes(urlHash))) return true;
try {
if (this.getDefaultConnector().exists(CollectionSchema.id.getSolrFieldName(), ASCII.String(urlHash))) return true;
if (this.getDefaultConnector().exists(CollectionSchema.id.getSolrFieldName(), urlHash)) return true;
} catch (final Throwable e) {
Log.logException(e);
}

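The Fulltext deletions above now remove entries from both cores; a condensed sketch of the pattern (method names from this commit, wrapper hypothetical):

    import java.io.IOException;
    import net.yacy.cora.document.ASCII;
    import net.yacy.search.index.Fulltext;
    import net.yacy.search.schema.WebgraphSchema;

    static void removeDocumentAndEdges(final Fulltext fulltext, final byte[] urlHash) throws IOException {
        final String id = ASCII.String(urlHash);
        // drop the document from core 'collection1' ...
        fulltext.getDefaultConnector().delete(id);
        // ... and all edges that start at this document from core 'webgraph'
        fulltext.getWebgraphConnector().deleteByQuery(
                WebgraphSchema.source_id_s.getSolrFieldName() + ":" + id);
    }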
@ -195,7 +195,7 @@ public class Segment {
}
public long URLCount() {
return this.fulltext.size();
return this.fulltext.collectionSize();
}
public long RWICount() {
@ -219,7 +219,7 @@ public class Segment {
return count;
}
public boolean exists(final byte[] urlhash) {
public boolean exists(final String urlhash) {
return this.fulltext.exists(urlhash);
}
@ -284,16 +284,16 @@ public class Segment {
return this.segmentPath;
}
private int addCitationIndex(final DigestURI url, final Date urlModified, final Map<MultiProtocolURI, Properties> anchors) {
private int addCitationIndex(final DigestURI url, final Date urlModified, final Map<DigestURI, Properties> anchors) {
if (anchors == null) return 0;
int refCount = 0;
// iterate over all outgoing links; this will create a context for those links
final byte[] urlhash = url.hash();
final long urldate = urlModified.getTime();
for (Map.Entry<MultiProtocolURI, Properties> anchorEntry: anchors.entrySet()) {
MultiProtocolURI anchor = anchorEntry.getKey();
byte[] refhash = DigestURI.toDigestURI(anchor).hash();
for (Map.Entry<DigestURI, Properties> anchorEntry: anchors.entrySet()) {
DigestURI anchor = anchorEntry.getKey();
byte[] refhash = anchor.hash();
//System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString());
if (this.urlCitationIndex != null) try {
this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate));
@ -377,7 +377,7 @@ public class Segment {
// DO A SOFT/HARD COMMIT IF NEEDED
if (MemoryControl.shortStatus()) {
// do a 'hard' commit to flush index caches
this.fulltext.getDefaultConnector().commit(false);
this.fulltext.commit(false);
} else {
if (
(this.fulltext.getDefaultConfiguration().contains(CollectionSchema.exact_signature_l) && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.exact_signature_unique_b)) ||
@ -404,7 +404,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final SolrInputDocument solrInputDoc = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex);
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
for (CollectionSchema[] checkfields: new CollectionSchema[][]{
@ -414,11 +414,11 @@ public class Segment {
CollectionSchema uniquefield = checkfields[1];
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
// lookup the document with the same signature
long signature = ((Long) solrInputDoc.getField(checkfield.getSolrFieldName()).getValue()).longValue();
long signature = ((Long) vector.getField(checkfield.getSolrFieldName()).getValue()).longValue();
try {
if (this.fulltext.getDefaultConnector().exists(checkfield.getSolrFieldName(), Long.toString(signature))) {
// change the unique attribute in the content
solrInputDoc.setField(uniquefield.getSolrFieldName(), false);
vector.setField(uniquefield.getSolrFieldName(), false);
}
} catch (IOException e) {}
}
@ -434,14 +434,14 @@ public class Segment {
// lookup in the index for the same title
String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
if (checkstring.length() == 0) {
solrInputDoc.setField(uniquefield.getSolrFieldName(), false);
vector.setField(uniquefield.getSolrFieldName(), false);
continue uniquecheck;
}
checkstring = ClientUtils.escapeQueryChars("\"" + checkstring + "\"");
try {
if (this.fulltext.getDefaultConnector().exists(checkfield.getSolrFieldName(), checkstring)) {
// switch unique attribute in new document
solrInputDoc.setField(uniquefield.getSolrFieldName(), false);
vector.setField(uniquefield.getSolrFieldName(), false);
// switch the attribute also in all existing documents (of which there should be exactly one!)
SolrDocumentList docs = this.fulltext.getDefaultConnector().query(checkfield.getSolrFieldName() + ":" + checkstring + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
for (SolrDocument doc: docs) {
@ -450,7 +450,7 @@ public class Segment {
this.fulltext.getDefaultConnector().add(sid);
}
} else {
solrInputDoc.setField(uniquefield.getSolrFieldName(), true);
vector.setField(uniquefield.getSolrFieldName(), true);
}
} catch (IOException e) {}
}
@ -459,7 +459,7 @@ public class Segment {
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.urlCitationIndex != null && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.references_i)) {
int references = this.urlCitationIndex.count(url.hash());
if (references > 0) solrInputDoc.setField(CollectionSchema.references_i.getSolrFieldName(), references);
if (references > 0) vector.setField(CollectionSchema.references_i.getSolrFieldName(), references);
}
// STORE TO SOLR
@ -467,7 +467,20 @@ public class Segment {
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;
this.fulltext.putDocument(solrInputDoc);
this.fulltext.putDocument(vector);
break tryloop;
} catch ( final IOException e ) {
error = "failed to send " + urlNormalform + " to solr";
Log.logWarning("SOLR", error + e.getMessage());
if (i == 10) this.fulltext.commit(false);
try {Thread.sleep(1000);} catch (InterruptedException e1) {}
continue tryloop;
}
}
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;
this.fulltext.putEdges(vector.getWebgraphDocuments());
break tryloop;
} catch ( final IOException e ) {
error = "failed to send " + urlNormalform + " to solr";
@ -567,7 +580,7 @@ public class Segment {
}
// finished
return solrInputDoc;
return vector;
}
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {

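A condensed sketch of Segment's new two-step write path shown above (names from this commit; surrounding variables such as id, profile and condenser are assumed from the calling context in Segment):

    // yacy2solr now returns a SolrVector: the collection document plus one
    // edge document per parsed link, destined for the webgraph core
    final CollectionConfiguration.SolrVector vector = fulltext.getDefaultConfiguration().yacy2solr(
            id, profile, responseHeader, document, condenser, referrerURL, language,
            urlCitationIndex, fulltext.getWebgraphConfiguration());
    fulltext.putDocument(vector);                     // -> core 'collection1'
    fulltext.putEdges(vector.getWebgraphDocuments()); // -> core 'webgraph'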
@ -46,7 +46,6 @@ import org.apache.solr.client.solrj.SolrQuery.ORDER;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@ -531,11 +530,11 @@ public final class QueryParams {
return this.queryGoal;
}
public final Map<MultiProtocolURI, String> separateMatches(final Map<MultiProtocolURI, String> links) {
final Map<MultiProtocolURI, String> matcher = new HashMap<MultiProtocolURI, String>();
final Iterator <Map.Entry<MultiProtocolURI, String>> i = links.entrySet().iterator();
Map.Entry<MultiProtocolURI, String> entry;
MultiProtocolURI url;
public final Map<DigestURI, String> separateMatches(final Map<DigestURI, String> links) {
final Map<DigestURI, String> matcher = new HashMap<DigestURI, String>();
final Iterator <Map.Entry<DigestURI, String>> i = links.entrySet().iterator();
Map.Entry<DigestURI, String> entry;
DigestURI url;
String anchorText;
while (i.hasNext()) {
entry = i.next();

@ -38,7 +38,6 @@ import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.ASCII;
@ -158,7 +157,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public SolrInputDocument metadata2solr(final URIMetadataRow md) {
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = DigestURI.toDigestURI(md.url());
final DigestURI digestURI = md.url();
boolean allAttr = this.isEmpty();
if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, "");
@ -283,13 +282,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public SolrInputDocument yacy2solr(
public static class SolrVector extends SolrInputDocument {
private static final long serialVersionUID = -210901881471714939L;
private List<SolrInputDocument> webgraphDocuments;
public SolrVector() {
super();
this.webgraphDocuments = new ArrayList<SolrInputDocument>();
}
public void addWebgraphDocument(SolrInputDocument webgraphDocument) {
this.webgraphDocuments.add(webgraphDocument);
}
public List<SolrInputDocument> getWebgraphDocuments() {
return this.webgraphDocuments;
}
}
public SolrVector yacy2solr(
final String id, final CrawlProfile profile, final ResponseHeader responseHeader,
final Document document, Condenser condenser, DigestURI referrerURL, String language,
IndexCell<CitationReference> citations) {
IndexCell<CitationReference> citations,
WebgraphConfiguration webgraph) {
// we use the SolrCell design as index schema
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source());
SolrVector doc = new SolrVector();
final DigestURI digestURI = document.dc_source();
boolean allAttr = this.isEmpty();
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
@ -299,24 +314,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String docurl = digestURI.toNormalform(true);
add(doc, CollectionSchema.sku, docurl);
int clickdepth = -1;
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
if (digestURI.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(doc, CollectionSchema.clickdepth_i, 0);
clickdepth = 0;
this.lazy = lc;
} else {
// search the citations for references
int clickdepth = -1;
try {
clickdepth = getClickDepth(citations, digestURI);
} catch (IOException e) {
add(doc, CollectionSchema.clickdepth_i, -1);
}
add(doc, CollectionSchema.clickdepth_i, clickdepth);
if (clickdepth < 0 || clickdepth > 1) {
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also scheduled for positive depths because a shorter path (a shortcut) may exist
}
}
add(doc, CollectionSchema.clickdepth_i, clickdepth);
}
if (allAttr || contains(CollectionSchema.ip_s)) {
@ -415,12 +430,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
// get a list of all links; it will be shrunk by removing urls that appear in other fields of the solr schema
Set<MultiProtocolURI> inboundLinks = document.inboundLinks();
Set<MultiProtocolURI> outboundLinks = document.outboundLinks();
Set<DigestURI> inboundLinks = document.inboundLinks();
Set<DigestURI> outboundLinks = document.outboundLinks();
int c = 0;
final Object parser = document.getParserObject();
Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
images = html.getImages();
@ -546,11 +561,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// style sheets
if (allAttr || contains(CollectionSchema.css_tag_txt)) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final Map<DigestURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
for (final Map.Entry<DigestURI, String> entry: csss.entrySet()) {
final String cssurl = entry.getKey().toNormalform(false);
inboundLinks.remove(entry.getKey());
outboundLinks.remove(entry.getKey());
@ -567,10 +582,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// Scripts
if (allAttr || contains(CollectionSchema.scripts_txt)) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final Set<DigestURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final MultiProtocolURI u: scriptss) {
for (final DigestURI u: scriptss) {
inboundLinks.remove(u);
outboundLinks.remove(u);
scripts[c++] = u.toNormalform(false);
@ -581,10 +596,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// Frames
if (allAttr || contains(CollectionSchema.frames_txt)) {
final Set<MultiProtocolURI> framess = html.getFrames();
final Set<DigestURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final MultiProtocolURI u: framess) {
for (final DigestURI u: framess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
frames[c++] = u.toNormalform(false);
@ -595,10 +610,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// IFrames
if (allAttr || contains(CollectionSchema.iframes_txt)) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final Set<DigestURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final MultiProtocolURI u: iframess) {
for (final DigestURI u: iframess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
iframes[c++] = u.toNormalform(false);
@ -609,7 +624,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_t)) {
final MultiProtocolURI canonical = html.getCanonical();
final DigestURI canonical = html.getCanonical();
if (canonical != null) {
inboundLinks.remove(canonical);
outboundLinks.remove(canonical);
@ -665,104 +680,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
}
// list all links
final Map<MultiProtocolURI, Properties> alllinks = document.getAnchors();
c = 0;
// statistics about the links
if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, inboundLinks.size());
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksName = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksRel = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksText = new ArrayList<String>(inboundLinks.size());
final List<Integer> inboundlinksTextChars = new ArrayList<Integer>(inboundLinks.size());
final List<Integer> inboundlinksTextWords = new ArrayList<Integer>(inboundLinks.size());
final List<String> inboundlinksAltTag = new ArrayList<String>(inboundLinks.size());
for (final MultiProtocolURI u: inboundLinks) {
final Properties p = alllinks.get(u);
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = u.toNormalform(false);
final int pr = urls.indexOf("://",0);
inboundlinksURLProtocol.add(urls.substring(0, pr));
inboundlinksURLStub.add(urls.substring(pr + 3));
inboundlinksName.add(name.length() > 0 ? name : "");
inboundlinksRel.add(rel.length() > 0 ? rel : "");
inboundlinksText.add(text.length() > 0 ? text : "");
inboundlinksTextChars.add(text.length() > 0 ? text.length() : 0);
inboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
inboundlinksTag.add(
"<a href=\"" + u.toNormalform(false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>");
ImageEntry ientry = images.get(u);
inboundlinksAltTag.add(ientry == null ? "" : ientry.alt());
c++;
}
if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, inboundlinksTag);
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(inboundlinksURLProtocol));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (allAttr || contains(CollectionSchema.inboundlinks_name_txt)) add(doc, CollectionSchema.inboundlinks_name_txt, inboundlinksName);
if (allAttr || contains(CollectionSchema.inboundlinks_rel_sxt)) add(doc, CollectionSchema.inboundlinks_rel_sxt, inboundlinksRel);
if (allAttr || contains(CollectionSchema.inboundlinks_relflags_val)) add(doc, CollectionSchema.inboundlinks_relflags_val, relEval(inboundlinksRel));
if (allAttr || contains(CollectionSchema.inboundlinks_text_txt)) add(doc, CollectionSchema.inboundlinks_text_txt, inboundlinksText);
if (allAttr || contains(CollectionSchema.inboundlinks_text_chars_val)) add(doc, CollectionSchema.inboundlinks_text_chars_val, inboundlinksTextChars);
if (allAttr || contains(CollectionSchema.inboundlinks_text_words_val)) add(doc, CollectionSchema.inboundlinks_text_words_val, inboundlinksTextWords);
if (allAttr || contains(CollectionSchema.inboundlinks_alttag_txt)) add(doc, CollectionSchema.inboundlinks_alttag_txt, inboundlinksAltTag);
c = 0;
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
final List<String> outboundlinksTag = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksURLProtocol = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksURLStub = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksName = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksRel = new ArrayList<String>(outboundLinks.size());
final List<Integer> outboundlinksTextChars = new ArrayList<Integer>(outboundLinks.size());
final List<Integer> outboundlinksTextWords = new ArrayList<Integer>(outboundLinks.size());
final List<String> outboundlinksText = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksAltTag = new ArrayList<String>(outboundLinks.size());
for (final MultiProtocolURI u: outboundLinks) {
final Properties p = alllinks.get(u);
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = u.toNormalform(false);
final int pr = urls.indexOf("://",0);
outboundlinksURLProtocol.add(urls.substring(0, pr));
outboundlinksURLStub.add(urls.substring(pr + 3));
outboundlinksName.add(name.length() > 0 ? name : "");
outboundlinksRel.add(rel.length() > 0 ? rel : "");
outboundlinksText.add(text.length() > 0 ? text : "");
outboundlinksTextChars.add(text.length() > 0 ? text.length() : 0);
outboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
outboundlinksTag.add(
"<a href=\"" + u.toNormalform(false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>");
ImageEntry ientry = images.get(u);
outboundlinksAltTag.add(ientry == null ? "" : ientry.alt());
c++;
}
if (allAttr || contains(CollectionSchema.outboundlinks_tag_txt)) add(doc, CollectionSchema.outboundlinks_tag_txt, outboundlinksTag);
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(outboundlinksURLProtocol));
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (allAttr || contains(CollectionSchema.outboundlinks_name_txt)) add(doc, CollectionSchema.outboundlinks_name_txt, outboundlinksName);
if (allAttr || contains(CollectionSchema.outboundlinks_rel_sxt)) add(doc, CollectionSchema.outboundlinks_rel_sxt, outboundlinksRel);
if (allAttr || contains(CollectionSchema.outboundlinks_relflags_val)) add(doc, CollectionSchema.outboundlinks_relflags_val, relEval(outboundlinksRel));
if (allAttr || contains(CollectionSchema.outboundlinks_text_txt)) add(doc, CollectionSchema.outboundlinks_text_txt, outboundlinksText);
if (allAttr || contains(CollectionSchema.outboundlinks_text_chars_val)) add(doc, CollectionSchema.outboundlinks_text_chars_val, outboundlinksTextChars);
if (allAttr || contains(CollectionSchema.outboundlinks_text_words_val)) add(doc, CollectionSchema.outboundlinks_text_words_val, outboundlinksTextWords);
if (allAttr || contains(CollectionSchema.outboundlinks_alttag_txt)) add(doc, CollectionSchema.outboundlinks_alttag_txt, outboundlinksAltTag);
// list all links
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks);
doc.webgraphDocuments.addAll(subgraph.edges);
if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, subgraph.tags[0]);
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]);
if (allAttr || contains(CollectionSchema.outboundlinks_tag_txt)) add(doc, CollectionSchema.outboundlinks_tag_txt, subgraph.tags[1]);
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, subgraph.urlStubs[1]);
// charset
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());
@ -896,6 +829,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param rel
* @return binary encoded information about rel
*/
/*
private static List<Integer> relEval(final List<String> rel) {
List<Integer> il = new ArrayList<Integer>(rel.size());
for (final String s: rel) {
@ -907,6 +841,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
return il;
}
*/
/**
* register an entry as error document

@ -107,9 +107,13 @@ public enum CollectionSchema implements SchemaDeclaration {
// bit 12: "unavailable_after" contained in http header properties
robots_i(SolrType.num_integer, true, true, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, "internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
/*
inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"),
inboundlinks_relflags_val(SolrType.num_integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
@ -117,9 +121,6 @@ public enum CollectionSchema implements SchemaDeclaration {
inboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of characters"),
inboundlinks_text_words_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of words"),
inboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also linked as an img link"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"),
outboundlinks_relflags_val(SolrType.num_integer, true, true, true, "external links, the rel property of the a-tag, coded binary"),
@ -127,6 +128,7 @@ public enum CollectionSchema implements SchemaDeclaration {
outboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of characters"),
outboundlinks_text_words_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of words"),
outboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also linked as an img link"),
*/
images_tag_txt(SolrType.text_general, true, true, true, "all image tags, encoded as <img> tag including the alt and title properties"),
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"),

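The hunk above moves the per-link detail fields (name, rel, relflags, text and alt) out of the collection schema; they reappear as per-edge fields in the webgraph schema further down. A collection1 document keeps only the link summaries, sketched here as a SolrJ query (host, port and the example URL are assumptions):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrDocument;

public class CollectionLinkSummarySketch {
    public static void main(String[] args) throws SolrServerException {
        HttpSolrServer collection1 = new HttpSolrServer("http://localhost:8983/solr/collection1");
        SolrQuery q = new SolrQuery("sku:\"http://example.com/\"");
        // per-link name/rel/text detail now lives in the webgraph core only
        q.setFields("inboundlinkscount_i", "inboundlinks_urlstub_txt",
                    "outboundlinkscount_i", "outboundlinks_urlstub_txt");
        for (SolrDocument d : collection1.query(q).getResults()) {
            System.out.println(d.getFieldValue("inboundlinkscount_i") + " inbound links");
        }
    }
}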
@ -27,10 +27,23 @@ package net.yacy.search.schema;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@ -74,7 +87,178 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
}
}
public static class Subgraph {
public final ArrayList<String>[] tags, urlProtocols, urlStubs; // index 0: inbound links, index 1: outbound links
public final ArrayList<SolrInputDocument> edges; // one solr document per link
@SuppressWarnings("unchecked")
public Subgraph(int inboundSize, int outboundSize) {
this.tags = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlProtocols = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlStubs = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.edges = new ArrayList<SolrInputDocument>(inboundSize + outboundSize);
}
}
public Subgraph edges(
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth,
final Map<DigestURI, Properties> alllinks,
final Map<DigestURI, ImageEntry> images,
final Set<DigestURI> inboundLinks,
final Set<DigestURI> outboundLinks
) {
boolean allAttr = this.isEmpty();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, true, inboundLinks);
addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, false, outboundLinks);
return subgraph;
}
private void addEdges(
final Subgraph subgraph,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth,
final boolean allAttr, final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
final boolean inbound, final Set<DigestURI> links) {
for (final DigestURI target_url: links) {
final Properties p = alllinks.get(target_url);
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String rel = p.getProperty("rel", ""); // the rel-attribute
int ioidx = inbound ? 0 : 1;
// index organization: the edge key is the source url hash followed by the
// target url hash and an 8-digit hex code over the link properties, so that
// several links between the same two documents still get distinct keys
StringBuilder idi = new StringBuilder(8);
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
while (idi.length() < 8) idi.insert(0, '0'); // zero-pad to exactly 8 hex digits
String source_id = ASCII.String(source.hash());
String target_id = ASCII.String(target_url.hash());
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
SolrInputDocument edge = new SolrInputDocument();
add(edge, WebgraphSchema.id, id.toString());
if (allAttr || contains(WebgraphSchema.load_date_dt) || contains(WebgraphSchema.last_modified)) {
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // a last-modified date in the future is clamped to the load date
if (allAttr || contains(WebgraphSchema.load_date_dt)) add(edge, WebgraphSchema.load_date_dt, loadDate);
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, modDate);
}
add(edge, WebgraphSchema.collection_sxt, collections);
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
final String source_url_string = source.toNormalform(false);
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
Map<String, String> source_searchpart = source.getSearchpartMap();
if (source_searchpart == null) {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
String source_host = null;
if ((source_host = source.getHost()) != null) {
String dnc = Domains.getDNC(source_host);
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.source_file_ext_s)) add(edge, WebgraphSchema.source_file_ext_s, source.getFileExtension());
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
String[] paths = source.getPaths();
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth);
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
String tag = "<a href=\"" + target_url.toNormalform(false) + "\"" + (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + "</a>";
subgraph.tags[ioidx].add(tag);
if (allAttr || contains(WebgraphSchema.target_tag_s)) add(edge, WebgraphSchema.target_tag_s, tag);
ImageEntry ientry = images.get(target_url);
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
if (target_searchpart == null) {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
String target_host = null;
if ((target_host = target_url.getHost()) != null) {
String dnc = Domains.getDNC(target_host);
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.target_file_ext_s)) add(edge, WebgraphSchema.target_file_ext_s, target_url.getFileExtension());
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
String[] paths = target_url.getPaths();
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
add(edge, WebgraphSchema.target_clickdepth_i, clickdepth);
// add the edge to the subgraph
subgraph.edges.add(edge);
}
}
/**
* encode a string containing attributes from anchor rel properties binary:
* bit 0: "me" contained in rel
* bit 1: "nofollow" contained in rel
* @param rel
* @return binary encoded information about rel
*/
private static int relEval(final String rels) {
int i = 0;
// the rel attribute may contain several space-separated values, e.g. rel="me nofollow"
for (final String s: CommonPattern.SPACE.split(rels.toLowerCase().trim())) {
if ("me".equals(s)) i |= 1;
if ("nofollow".equals(s)) i |= 2;
}
return i;
}
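// For illustration, values produced by the encoding above (rel values are
// tokenized at spaces, as implemented here):
//   relEval("nofollow")    == 2
//   relEval("me nofollow") == 3
//   relEval("author")      == 0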
/**
* save configuration to file and update enum SolrFields

@ -30,16 +30,19 @@ import org.apache.solr.common.SolrInputDocument;
public enum WebgraphSchema implements SchemaDeclaration {
// index organisation
id(SolrType.string, true, true, false, "primary key of document, a combination of <source-url-hash><target-url-hash><eight-digit-hex-code> (32 characters)"),
last_modified(SolrType.date, true, true, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
// source information
source_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (source)"),
source_url_s(SolrType.string, true, true, false, "the url of the document (source)"),
source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"),
source_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (source)"),
source_file_ext_s(SolrType.string, true, true, false, "the file name extension (source)"),
source_tag_s(SolrType.string, true, true, false, "normalized (absolute URLs), as <a> - tag with anchor text and nofollow (source)"),
source_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (source)"),
source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"),
source_path_s(SolrType.string, true, true, true, "path of the url (source)"),
source_path_s(SolrType.string, true, true, false, "path of the url (source)"),
source_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (source)"),
source_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (source)"),
source_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (source)"),
@ -47,12 +50,14 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_host_s(SolrType.string, true, true, false, "host of the url"),
source_host_s(SolrType.string, true, true, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, "id of the host (source)"),
source_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)"),
source_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain (source)"),
source_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' (source)"),
source_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (source)"),
// information in the source about the target
target_linktext_t(SolrType.text_general, true, true, false, "the text content of the a-tag (in source, but pointing to a target)"),
target_linktext_charcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"),
target_linktext_wordcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"),
@ -63,14 +68,15 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_rel_s(SolrType.string, true, true, false, "the rel property of the a-tag (in source, but pointing to a target)"),
target_relflags_i(SolrType.num_integer, true, true, false, "the rel property of the a-tag, coded binary (in source, but pointing to a target)"),
// target information
target_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (target)"),
target_url_s(SolrType.string, true, true, false, "the url of the document (target)"),
target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"),
target_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (target)"),
target_file_ext_s(SolrType.string, true, true, false, "the file name extension (target)"),
target_tag_s(SolrType.string, true, true, false, "normalized (absolute URLs), as <a> - tag with anchor text and nofollow (target)"),
target_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (target)"),
target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"),
target_path_s(SolrType.string, true, true, true, "path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer, true, true, true, "count of all path elements in the url (target)"),
target_path_s(SolrType.string, true, true, false, "path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (target)"),
target_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (target)"),
target_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url (target)"),
@ -78,11 +84,14 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_host_s(SolrType.string, true, true, false, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, "id of the host (target)"),
target_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)"),
target_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain (target)"),
target_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' (target)"),
target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)");
target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)"),
target_inbound_b(SolrType.bool, true, true, false, "flag that shows whether the target host is equal to the source host");
public final static String CORE_NAME = "webgraph";
public final static String VOCABULARY_PREFIX = "vocabulary_";

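Since every link becomes one webgraph document under this schema, the anchor texts of linked but not yet indexed documents are searchable. A sketch of such a lookup with SolrJ (host, port and the target url stub are placeholders; the field names come from the enum above):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrDocument;

public class WebgraphLookupSketch {
    public static void main(String[] args) throws SolrServerException {
        HttpSolrServer webgraph = new HttpSolrServer("http://localhost:8983/solr/webgraph");
        // all edges pointing at one target: who links there, and with which anchor text?
        SolrQuery q = new SolrQuery("target_urlstub_s:\"example.com/page.html\"");
        q.setFields("source_urlstub_s", "target_linktext_t", "target_rel_s", "target_inbound_b");
        q.setRows(100);
        for (SolrDocument edge : webgraph.query(q).getResults()) {
            System.out.println(edge.getFieldValue("source_urlstub_s")
                    + " -> \"" + edge.getFieldValue("target_linktext_t") + "\"");
        }
    }
}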
@ -38,7 +38,6 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
@ -164,20 +163,20 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
public static List<MediaSnippet> computeMediaSnippets(final DigestURI source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) {
if (document == null) return new ArrayList<MediaSnippet>();
Map<MultiProtocolURI, String> media = null;
Map<DigestURI, String> media = null;
if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks();
else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks();
else if (mediatype == ContentDomain.APP) media = document.getApplinks();
if (media == null) return null;
final Iterator<Map.Entry<MultiProtocolURI, String>> i = media.entrySet().iterator();
Map.Entry<MultiProtocolURI, String> entry;
final Iterator<Map.Entry<DigestURI, String>> i = media.entrySet().iterator();
Map.Entry<DigestURI, String> entry;
DigestURI url;
String desc;
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
entry = i.next();
url = DigestURI.toDigestURI(entry.getKey());
url = entry.getKey();
desc = entry.getValue();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() +
@ -202,7 +201,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
ientry = i.next();
url = DigestURI.toDigestURI(ientry.url());
url = ientry.url();
final String u = url.toString();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;

@ -309,7 +309,7 @@ public final class HTTPDProxyHandler {
DigestURI url = null;
try {
url = DigestURI.toDigestURI(HeaderFramework.getRequestURL(conProp));
url = HeaderFramework.getRequestURL(conProp);
if (log.isFine()) log.logFine(reqID +" GET "+ url);
if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader);
@ -392,7 +392,7 @@ public final class HTTPDProxyHandler {
final Request request = new Request(
null,
url,
requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(),
requestHeader.referer() == null ? null : requestHeader.referer().hash(),
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
@ -528,7 +528,7 @@ public final class HTTPDProxyHandler {
final Request request = new Request(
null,
url,
requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(),
requestHeader.referer() == null ? null : requestHeader.referer().hash(),
"",
responseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
