diff --git a/defaults/yacy.init b/defaults/yacy.init index 70a179959..6dcfb433f 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -98,6 +98,13 @@ filesize.max.other = 8589934591 network.unit.definition = defaults/yacy.network.freeworld.unit #network.unit.definition = defaults/yacy.network.intranet.unit +# distinguish intranet/internet IPs: +# if this setting is set to true, then only URL-Hashes with 'intranet'-Flag are created, even if the +# url is in the internet. This can be done to enhance the crawling speed dramatically since a DNS-lookup +# to check if a host is in the internet or an intranet can be omitted. +# This option is only valid if the network.unit.domain property is set to 'any' +network.unit.domain.nocheck = false + # Update process properties # The update server location is given in the network.unit.definition, # but the settings for update processing and cycles are individual. diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java index 8b560e589..5dee5527b 100644 --- a/htroot/ConfigNetwork_p.java +++ b/htroot/ConfigNetwork_p.java @@ -190,7 +190,7 @@ public class ConfigNetwork_p { prop.putHTML("network.unit.definition", sb.getConfig("network.unit.definition", "")); prop.putHTML("network.unit.name", sb.getConfig(SwitchboardConstants.NETWORK_NAME, "")); prop.putHTML("network.unit.description", sb.getConfig("network.unit.description", "")); - prop.putHTML("network.unit.domain", sb.getConfig("network.unit.domain", "")); + prop.putHTML("network.unit.domain", sb.getConfig(SwitchboardConstants.NETWORK_DOMAIN, "")); prop.putHTML("network.unit.dht", sb.getConfig("network.unit.dht", "")); networkBootstrapLocations.remove(sb.getConfig("network.unit.definition", "")); int c = 0; diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index e463f1ccb..7056a4e03 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -34,6 +34,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import 
java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -414,7 +415,7 @@ public class Crawler_p { writer.close(); // get links and generate filter - final Map hyperlinks = scraper.getAnchors(); + final Map hyperlinks = scraper.getAnchors(); if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet()); final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString()); @@ -492,7 +493,7 @@ public class Crawler_p { // String description = scraper.getDescription(); // get links and generate filter - final Map hyperlinks = scraper.getAnchors(); + final Map hyperlinks = scraper.getAnchors(); if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet()); // put links onto crawl queue @@ -515,10 +516,10 @@ public class Crawler_p { cachePolicy); sb.crawler.putActive(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - final Iterator> linkiterator = hyperlinks.entrySet().iterator(); + final Iterator> linkiterator = hyperlinks.entrySet().iterator(); DigestURI nexturl; while (linkiterator.hasNext()) { - final Map.Entry e = linkiterator.next(); + final Map.Entry e = linkiterator.next(); if (e.getKey() == null) continue; nexturl = new DigestURI(e.getKey()); // remove the url from the database to be prepared to crawl them again @@ -530,7 +531,7 @@ public class Crawler_p { sb.peers.mySeed().hash.getBytes(), nexturl, null, - e.getValue(), + e.getValue().getProperty("name", ""), new Date(), profile.handle(), 0, diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 8dce3fb82..22e25b2c6 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -35,6 +35,7 @@ import java.net.UnknownHostException; import java.util.Date; import java.util.Iterator; import java.util.Map; 
+import java.util.Properties; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; @@ -231,7 +232,7 @@ public final class CrawlStacker { } } } - public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map hyperlinks, boolean replace) { + public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map hyperlinks, boolean replace) { new Thread() { public void run() { enqueueEntries(initiator, profileHandle, hyperlinks, true); @@ -239,8 +240,8 @@ public final class CrawlStacker { }.start(); } - public void enqueueEntries(byte[] initiator, String profileHandle, Map hyperlinks, boolean replace) { - for (Map.Entry e: hyperlinks.entrySet()) { + public void enqueueEntries(byte[] initiator, String profileHandle, Map hyperlinks, boolean replace) { + for (Map.Entry e: hyperlinks.entrySet()) { if (e.getKey() == null) continue; // delete old entry, if exists to force a re-load of the url (thats wanted here) @@ -272,7 +273,7 @@ public final class CrawlStacker { initiator, url, null, - e.getValue(), + e.getValue().getProperty("name", ""), new Date(), profileHandle, 0, diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 056005877..ea7c4d6c8 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -305,7 +305,7 @@ public class RobotsTxt { if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress."); // sending the get request - robotsTxt = client.GETbytes(robotsURL.toString()); + robotsTxt = client.GETbytes(robotsURL); // statistics: if (robotsTxt != null) { ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length); diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index 113a5f04f..b5035e1f2 100755 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -53,7 +53,7 @@ 
public class ZURL implements Iterable { private static final int EcoFSBufferSize = 2000; private static final int maxStackSize = 1000; - public final static Row rowdef = new Row( + private final static Row rowdef = new Row( "String urlhash-" + Word.commonHashLength + ", " + // the url's hash "String executor-" + Word.commonHashLength + ", " + // the crawling executor "Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load @@ -64,8 +64,8 @@ public class ZURL implements Iterable { ); // the class object - protected Index urlIndex; - protected final ConcurrentLinkedQueue stack; + private Index urlIndex; + private final ConcurrentLinkedQueue stack; public ZURL( final File cachePath, diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 65cc0aef7..4e58ee710 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -125,7 +125,7 @@ public final class HTTPLoader { client.setTimout(socketTimeout); client.setHeader(requestHeader.entrySet()); // send request - final byte[] responseBody = client.GETbytes(url.toString(), maxFileSize); + final byte[] responseBody = client.GETbytes(url, maxFileSize); final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); @@ -241,7 +241,7 @@ public final class HTTPLoader { final HTTPClient client = new HTTPClient(); client.setTimout(20000); client.setHeader(requestHeader.entrySet()); - final byte[] responseBody = client.GETbytes(request.url().toString(), Long.MAX_VALUE); + final byte[] responseBody = client.GETbytes(request.url(), Long.MAX_VALUE); final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); // FIXME: 30*-handling (bottom) is never reached diff --git 
a/source/de/anomic/data/BookmarkHelper.java b/source/de/anomic/data/BookmarkHelper.java index 202d8eff4..788dbfa1d 100644 --- a/source/de/anomic/data/BookmarkHelper.java +++ b/source/de/anomic/data/BookmarkHelper.java @@ -36,6 +36,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; @@ -130,7 +131,7 @@ public class BookmarkHelper { int importCount = 0; - Map links = new HashMap(); + Map links = new HashMap(); String title; MultiProtocolURI url; Bookmark bm; @@ -144,9 +145,9 @@ public class BookmarkHelper { writer.close(); links = scraper.getAnchors(); } catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());} - for (final Entry link: links.entrySet()) { + for (final Entry link: links.entrySet()) { url = link.getKey(); - title = link.getValue(); + title = link.getValue().getProperty("name", ""); Log.logInfo("BOOKMARKS", "links.get(url)"); if ("".equals(title)) {//cannot be displayed title = url.toString(); diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java index 739c314a8..5bb42a26f 100644 --- a/source/de/anomic/http/client/Cache.java +++ b/source/de/anomic/http/client/Cache.java @@ -140,6 +140,15 @@ public final class Cache { if (responseHeader == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: responseHeader == null"); if (file == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: file == null"); log.logInfo("storing content of url " + url.toString() + ", " + file.length + " bytes"); + + // store the file + try { + fileDB.insert(url.hash(), file); + } catch (UnsupportedEncodingException e) { + throw new IOException("Cache.store: cannot write to fileDB (1): " + e.getMessage()); + } catch (IOException e) { + throw new 
IOException("Cache.store: cannot write to fileDB (2): " + e.getMessage()); + } // store the response header into the header database final HashMap hm = new HashMap(); @@ -154,15 +163,6 @@ public final class Cache { } catch (Exception e) { throw new IOException("Cache.store: cannot write to headerDB: " + e.getMessage()); } - - // store the file - try { - fileDB.insert(url.hash(), file); - } catch (UnsupportedEncodingException e) { - throw new IOException("Cache.store: cannot write to fileDB (1): " + e.getMessage()); - } catch (IOException e) { - throw new IOException("Cache.store: cannot write to fileDB (2): " + e.getMessage()); - } if (log.isFine()) log.logFine("stored in cache: " + url.toNormalform(true, false)); } @@ -173,8 +173,11 @@ public final class Cache { */ public static boolean has(final DigestURI url) { boolean headerExists; - headerExists = responseHeaderDB.containsKey(url.hash()); - boolean fileExists = fileDB.containsKey(url.hash()); + boolean fileExists; + //synchronized (responseHeaderDB) { + headerExists = responseHeaderDB.containsKey(url.hash()); + fileExists = fileDB.containsKey(url.hash()); + //} if (headerExists && fileExists) return true; if (!headerExists && !fileExists) return false; // if not both is there then we do a clean-up diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 4f9df3518..b5bbd7a1e 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -342,8 +342,8 @@ public class Segment { Response.docType(document.dc_format()), // doctype condenser.RESULT_FLAGS, // flags UTF8.getBytes(language), // language - document.inboundLinks(), // inbound links - document.outboundLinks(), // outbound links + document.inboundLinkCount(), // inbound links + document.outboundLinkCount(), // outbound links document.getAudiolinks().size(), // laudio document.getImages().size(), // limage document.getVideolinks().size(), // lvideo @@ -363,8 +363,8 @@ public class Segment 
{ condenser, // document condenser language, // document language Response.docType(document.dc_format()), // document type - document.inboundLinks(), // inbound links - document.outboundLinks(), // outbound links + document.inboundLinkCount(), // inbound links + document.outboundLinkCount(), // outbound links searchEvent, // a search event that can have results directly sourceName // the name of the source where the index was created ); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 6b513fe96..7d88f2151 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -574,6 +574,9 @@ public final class Switchboard extends serverSwitch { isGlobalMode(), this.domainList); // Intranet and Global mode may be both true! + // possibly switch off localIP check + Domains.setNoLocalCheck(this.isAllIPMode()); + // check status of account configuration: when local url crawling is allowed, it is not allowed // that an automatic authorization of localhost is done, because in this case crawls from local // addresses are blocked to prevent attack szenarios where remote pages contain links to localhost @@ -828,7 +831,7 @@ public final class Switchboard extends serverSwitch { setConfig(plasmaSwitchboardConstants.INDEX_RECEIVE_ALLOW, true); } */ - MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig("network.unit.domain", "global")); + MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? 
"-" : "/") + getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")); } @@ -941,11 +944,13 @@ public final class Switchboard extends serverSwitch { this.crawler, this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.peers, - "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, - "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, + "local.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0, + "global.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0, this.domainList); } + Domains.setNoLocalCheck(this.isAllIPMode()); // possibly switch off localIP check + // start up crawl jobs continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); @@ -1022,13 +1027,25 @@ public final class Switchboard extends serverSwitch { } public boolean isIntranetMode() { - return "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0; + return "local.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0; } public boolean isGlobalMode() { - return "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0; + return "global.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0; } + public boolean isAllIPMode() { + return "any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0; + } + + /** + * in nocheck mode the isLocal property is not checked to omit DNS lookup. 
Can only be done in allip mode + * @return + */ + public boolean isIPNoCheckMode() { + return isAllIPMode() && getConfigBool(SwitchboardConstants.NETWORK_DOMAIN_NOCHECK, false); + } + public boolean isRobinsonMode() { // we are in robinson mode, if we do not exchange index by dht distribution // we need to take care that search requests and remote indexing requests go only @@ -1893,9 +1910,13 @@ public final class Switchboard extends serverSwitch { for (Document doc: in.documents) { try { String id = UTF8.String(new DigestURI(doc.dc_identifier(), null).hash()); - assert id.equals(UTF8.String(in.queueEntry.url().hash())); + String iquh = UTF8.String(in.queueEntry.url().hash()); + if (!id.equals(iquh)) { + log.logWarning("doc=" + id + ":" + doc.dc_identifier() + ", query=" + iquh + ":" + in.queueEntry.url()); + // in case that this happens it appears that the doc id is the right one + } try { - this.solrConnector.add(id, doc); + this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc); } catch (IOException e) { Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage()); } @@ -1951,9 +1972,7 @@ public final class Switchboard extends serverSwitch { assert in.queueEntry != null; assert in.documents != null; assert in.queueEntry != null; - final Integer[] ioLinks = webStructure.generateCitationReference(in.queueEntry.url(), in.documents[i], (in.condenser == null) ? null : in.condenser[i], in.queueEntry.lastModified()); // [outlinksSame, outlinksOther] - in.documents[i].setInboundLinks(ioLinks[0].intValue()); - in.documents[i].setOutboundLinks(ioLinks[1].intValue()); + webStructure.generateCitationReference(in.queueEntry.url(), in.documents[i], (in.condenser == null) ? 
null : in.condenser[i], in.queueEntry.lastModified()); // [outlinksSame, outlinksOther] } return in; } @@ -2621,7 +2640,7 @@ public final class Switchboard extends serverSwitch { yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)"); } else { ssc++; - final byte[] content = client.GETbytes(url.toString()); + final byte[] content = client.GETbytes(url); enu = FileUtils.strings(content); lc = 0; while (enu.hasNext()) { @@ -2746,7 +2765,7 @@ public final class Switchboard extends serverSwitch { client.setHeader(reqHeader.entrySet()); try { // sending request - final Map result = FileUtils.table(client.GETbytes(url.toString())); + final Map result = FileUtils.table(client.GETbytes(url)); return (result == null) ? new HashMap() : result; } catch (final Exception e) { Log.logException(e); diff --git a/source/de/anomic/search/SwitchboardConstants.java b/source/de/anomic/search/SwitchboardConstants.java index 9fd46ad93..3a1814345 100644 --- a/source/de/anomic/search/SwitchboardConstants.java +++ b/source/de/anomic/search/SwitchboardConstants.java @@ -395,6 +395,8 @@ public final class SwitchboardConstants { * */ public static final String NETWORK_NAME = "network.unit.name"; + public static final String NETWORK_DOMAIN = "network.unit.domain"; + public static final String NETWORK_DOMAIN_NOCHECK = "network.unit.domain.nocheck"; public static final String NETWORK_WHITELIST = "network.unit.access.whitelist"; public static final String NETWORK_BLACKLIST = "network.unit.access.blacklist"; diff --git a/source/de/anomic/yacy/graphics/WebStructureGraph.java b/source/de/anomic/yacy/graphics/WebStructureGraph.java index 456248017..8a2578187 100644 --- a/source/de/anomic/yacy/graphics/WebStructureGraph.java +++ b/source/de/anomic/yacy/graphics/WebStructureGraph.java @@ -128,39 +128,31 @@ public class WebStructureGraph { } } - public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final DigestURI 
url, final Document document, final Condenser condenser, final Date docDate) { + public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser, final Date docDate) { // generate citation reference final Map hl = document.getHyperlinks(); final Iterator it = hl.keySet().iterator(); final HashSet globalRefURLs = new HashSet(); final String refhost = url.getHost(); MultiProtocolURI u; - int GCount = 0; - int LCount = 0; while (it.hasNext()) { u = it.next(); if (u == null) continue; - if (refhost != null && u.getHost() != null && u.getHost().equals(refhost)) { - // this is a local link - LCount++; - } else { + if (refhost != null && u.getHost() != null && !u.getHost().equals(refhost)) { // this is a global link - GCount++; globalRefURLs.add(u); } } - + leanrefObject lro = new leanrefObject(url, globalRefURLs); if (globalRefURLs.size() > 0) try { if (this.publicRefDNSResolvingWorker.isAlive()) { - this.publicRefDNSResolvingQueue.put(new leanrefObject(url, globalRefURLs)); + this.publicRefDNSResolvingQueue.put(lro); } else { - this.learnrefs(new leanrefObject(url, globalRefURLs)); + this.learnrefs(lro); } } catch (InterruptedException e) { - this.learnrefs(new leanrefObject(url, globalRefURLs)); + this.learnrefs(lro); } - - return new Integer[] {Integer.valueOf(LCount), Integer.valueOf(GCount)}; } public void learnrefs(final leanrefObject lro) { diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index 8805b8542..6644cc9b2 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -39,6 +39,7 @@ import java.security.SignatureException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -239,7 +240,7 @@ public final class yacyRelease extends yacyVersion { } // analyze links in scraper 
resource, and find link to latest release in it - final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation + final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation final TreeSet mainReleases = new TreeSet(); final TreeSet devReleases = new TreeSet(); for (MultiProtocolURI url : anchors.keySet()) { diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java index 8720557a1..999a1559e 100644 --- a/source/de/anomic/yacy/yacySeed.java +++ b/source/de/anomic/yacy/yacySeed.java @@ -71,6 +71,7 @@ import net.yacy.kelondro.order.Digest; import net.yacy.kelondro.util.MapTools; import net.yacy.kelondro.util.OS; +import de.anomic.search.Switchboard; import de.anomic.tools.bitfield; import de.anomic.tools.crypt; import de.anomic.yacy.dht.FlatWordPartitionScheme; @@ -844,8 +845,9 @@ public class yacySeed implements Cloneable, Comparable, Comparator 0 && ipString.length() < 8) return ipString + " -> IP is too short: "; InetAddress ip = Domains.dnsResolve(ipString); if (ip == null) return ipString + " -> IP is not proper"; //this does not work with staticIP - if (ipString.equals("localhost") || ipString.startsWith("127.") || ipString.startsWith("0:0:0:0:0:0:0:1")) return ipString + " - IP for localhost rejected"; - return null; + if (Switchboard.getSwitchboard().isAllIPMode()) return null; + boolean islocal = Domains.isLocal(ip); + return (!islocal && Switchboard.getSwitchboard().isGlobalMode() || (islocal && Switchboard.getSwitchboard().isIntranetMode())) ? 
null : ipString + " - IP for localhost rejected"; } @Override diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index ab6396125..3022eb1f3 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -892,7 +892,7 @@ public final class yacySeedDB implements AlternativeDomainNames { byte[] content = null; try { // send request - content = client.GETbytes(seedURL.toString()); + content = client.GETbytes(seedURL); } catch (final Exception e) { throw new IOException("Unable to download seed file '" + seedURL + "'. " + e.getMessage()); } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index ce793afc9..745edc3b2 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -1142,7 +1142,7 @@ public class MultiProtocolURI implements Serializable, Comparable keywords; // most resources provide a keyword field private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result @@ -73,14 +83,149 @@ public enum SolrScheme { private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure private Set languages; private boolean indexingDenied; - private float lon, lat; */ solrdoc.addField("title", yacydoc.dc_title()); solrdoc.addField("author", yacydoc.dc_creator()); solrdoc.addField("description", yacydoc.dc_description()); solrdoc.addField("content_type", yacydoc.dc_format()); - solrdoc.addField("subject", yacydoc.dc_subject(' ')); - solrdoc.addField("text", UTF8.String(yacydoc.getTextBytes())); + solrdoc.addField("last_modified", header.lastModified()); + solrdoc.addField("keywords", yacydoc.dc_subject(' ')); + String content = UTF8.String(yacydoc.getTextBytes()); + solrdoc.addField("attr_text", content); + int contentwc = content.split(" ").length; + 
solrdoc.addField("wordcount_i", contentwc); + + // path elements of link + String path = digestURI.getPath(); + if (path != null) { + String[] paths = path.split("/"); + if (paths.length > 0) solrdoc.addField("attr_paths", paths); + } + + // list all links + Map alllinks = yacydoc.getAnchors(); + int c = 0; + String[] inboundlinks = new String[yacydoc.inboundLinkCount()]; + solrdoc.addField("inboundlinkscount_i", inboundlinks.length); + for (MultiProtocolURI url: yacydoc.inboundLinks()) { + Properties p = alllinks.get(url); + String name = p.getProperty("name", ""); + String rel = p.getProperty("rel", ""); + inboundlinks[c++] = + "" + + ((name.length() > 0) ? name : "") + ""; + } + solrdoc.addField("attr_inboundlinks", inboundlinks); + c = 0; + String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; + solrdoc.addField("outboundlinkscount_i", outboundlinks.length); + for (MultiProtocolURI url: yacydoc.outboundLinks()) { + Properties p = alllinks.get(url); + String name = p.getProperty("name", ""); + String rel = p.getProperty("rel", ""); + outboundlinks[c++] = + "" + + ((name.length() > 0) ? name : "") + ""; + } + solrdoc.addField("attr_outboundlinks", yacydoc.outboundLinks().toArray()); + + // charset + solrdoc.addField("attr_charset", yacydoc.getCharset()); + + // coordinates + if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { + solrdoc.addField("lon_coordinate", yacydoc.lon()); + solrdoc.addField("lat_coordinate", yacydoc.lat()); + } + solrdoc.addField("attr_httpstatus", "200"); + Object parser = yacydoc.getParserObject(); + if (parser instanceof ContentScraper) { + ContentScraper html = (ContentScraper) parser; + + // header tags + int h = 0; + int f = 1; + for (int i = 1; i <= 6; i++) { + String[] hs = html.getHeadlines(i); + h = h | (hs.length > 0 ? 
f : 0); + f = f * 2; + solrdoc.addField("attr_h" + i, hs); + } + solrdoc.addField("htags_i", h); + + // meta tags + Map metas = html.getMetas(); + String robots = metas.get("robots"); + if (robots != null) solrdoc.addField("attr_meta_robots", robots); + String generator = metas.get("generator"); + if (generator != null) solrdoc.addField("attr_meta_generator", generator); + + // bold, italic + String[] bold = html.getBold(); + if (bold.length > 0) solrdoc.addField("attr_bold", bold); + String[] italic = html.getItalic(); + if (bold.length > 0) solrdoc.addField("attr_italic", italic); + String[] li = html.getLi(); + solrdoc.addField("licount_i", li.length); + if (li.length > 0) solrdoc.addField("attr_li", li); + + // images + Collection imagesc = html.getImages().values(); + String[] images = new String[imagesc.size()]; + c = 0; + for (ImageEntry ie: imagesc) images[c++] = ie.toString(); + solrdoc.addField("imagescount_i", images.length); + if (images.length > 0) solrdoc.addField("attr_images", images); + + // style sheets + Map csss = html.getCSS(); + String[] css = new String[csss.size()]; + c = 0; + for (Map.Entry entry: csss.entrySet()) { + css[c++] = + ""; + } + solrdoc.addField("csscount_i", css.length); + if (css.length > 0) solrdoc.addField("attr_css", css); + + // Scripts + Set scriptss = html.getScript(); + String[] scripts = new String[scriptss.size()]; + c = 0; + for (MultiProtocolURI url: scriptss) { + scripts[c++] = url.toNormalform(false, false, false, false); + } + solrdoc.addField("scriptscount_i", scripts.length); + if (scripts.length > 0) solrdoc.addField("attr_scripts", scripts); + + // Frames + Set framess = html.getFrames(); + String[] frames = new String[framess.size()]; + c = 0; + for (MultiProtocolURI entry: framess) { + frames[c++] = entry.toNormalform(false, false, false, false); + } + solrdoc.addField("framesscount_i", frames.length); + if (frames.length > 0) solrdoc.addField("attr_frames", frames); + + // IFrames + Set iframess = 
html.getFrames(); + String[] iframes = new String[iframess.size()]; + c = 0; + for (MultiProtocolURI entry: iframess) { + iframes[c++] = entry.toNormalform(false, false, false, false); + } + solrdoc.addField("iframesscount_i", iframes.length); + if (iframes.length > 0) solrdoc.addField("attr_iframes", iframes); + + // flash embedded + solrdoc.addField("flash_b", html.containsFlash()); + } return solrdoc; } @@ -88,11 +233,7 @@ public enum SolrScheme { /* * standard solr scheme - - - - @@ -100,7 +241,6 @@ public enum SolrScheme { - - - - - - */ } diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index 2e5c8c168..adc0faf15 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -41,6 +41,7 @@ import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; +import net.yacy.cora.protocol.ResponseHeader; import net.yacy.document.Document; import net.yacy.kelondro.logging.Log; @@ -187,12 +188,12 @@ public class SolrSingleConnector { } */ - public void add(String id, Document doc) throws IOException { - add(id, doc, this.scheme); + public void add(String id, ResponseHeader header, Document doc) throws IOException { + add(id, header, doc, this.scheme); } - public void add(String id, Document doc, SolrScheme tempScheme) throws IOException { - SolrInputDocument solrdoc = tempScheme.yacy2solr(id, doc); + public void add(String id, ResponseHeader header, Document doc, SolrScheme tempScheme) throws IOException { + SolrInputDocument solrdoc = tempScheme.yacy2solr(id, header, doc); int thisrrc = this.transmissionRoundRobinCounter; int nextrrc = thisrrc++; if (nextrrc >= transmissionQueueCount) nextrrc = 0; diff --git a/source/net/yacy/document/Document.java 
b/source/net/yacy/document/Document.java index ec164300b..6ce544a46 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -47,6 +47,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.TreeSet; @@ -72,33 +73,36 @@ public class Document { private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - private final Map anchors; // all links embedded as clickeable entities (anchor tags) + private final Map anchors; // all links embedded as clickeable entities (anchor tags) private final Map rss; // all embedded rss feeds private final Map images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. 
- private Map hyperlinks, audiolinks, videolinks, applinks; + private Map hyperlinks, audiolinks, videolinks, applinks, inboundlinks, outboundlinks; private Map emaillinks; private MultiProtocolURI favicon; private boolean resorted; - private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure private Set languages; private boolean indexingDenied; private float lon, lat; + private Object parserObject; // the source object that was used to create the Document - public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set languages, + public Document(final MultiProtocolURI location, final String mimeType, final String charset, + final Object parserObject, + final Set languages, final String[] keywords, final String title, final String author, final String publisher, final String[] sections, final String abstrct, final float lon, final float lat, final Object text, - final Map anchors, + final Map anchors, final Map rss, final Map images, boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; + this.parserObject = parserObject; this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords); this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title); this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); @@ -106,7 +110,7 @@ public class Document { this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); this.lon = lon; this.lat = lat; - this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.anchors = (anchors == null) ? new HashMap(0) : anchors; this.rss = (rss == null) ? new HashMap(0) : rss; this.images = (images == null) ? 
new HashMap() : images; this.publisher = publisher; @@ -116,19 +120,15 @@ public class Document { this.applinks = null; this.emaillinks = null; this.resorted = false; - this.inboundLinks = -1; - this.outboundLinks = -1; + this.inboundlinks = null; + this.outboundlinks = null; this.languages = languages; this.indexingDenied = indexingDenied; this.text = text == null ? new ByteArrayOutputStream() : text; } - public void setInboundLinks(int il) { - this.inboundLinks = il; - } - - public void setOutboundLinks(int ol) { - this.outboundLinks = ol; + public Object getParserObject() { + return this.parserObject; } /** @@ -179,8 +179,8 @@ dc_rights public String dc_creator() { return (creator == null) ? "" : creator.toString(); } - - public String dc_subject(final char separator) { + + public String[] dc_subject() { // sort out doubles and empty words final TreeSet hs = new TreeSet(); String s; @@ -189,11 +189,18 @@ dc_rights s = (this.keywords.get(i)).trim(); if (s.length() > 0) hs.add(s.toLowerCase()); } - if (hs.isEmpty()) return ""; + String[] t = new String[hs.size()]; + int i = 0; + for (String u: hs) t[i++] = u; + return t; + } + + public String dc_subject(final char separator) { + String[] t = dc_subject(); + if (t.length == 0) return ""; // generate a new list - final StringBuilder sb = new StringBuilder(this.keywords.size() * 6); - final Iterator i = hs.iterator(); - while (i.hasNext()) sb.append(i.next()).append(separator); + final StringBuilder sb = new StringBuilder(t.length * 8); + for (String s: t) sb.append(s).append(separator); return sb.substring(0, sb.length() - 1); } @@ -314,7 +321,7 @@ dc_rights return this.keywords; } - public Map getAnchors() { + public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return anchors; @@ -371,72 +378,79 @@ dc_rights return this.lat; } - private synchronized void resortLinks() { + private void resortLinks() { if (this.resorted) return; - - // 
extract hyperlinks, medialinks and emaillinks from anchorlinks - MultiProtocolURI url; - String u; - int extpos, qpos; - String ext = null; - final Iterator> i = anchors.entrySet().iterator(); - hyperlinks = new HashMap(); - videolinks = new HashMap(); - audiolinks = new HashMap(); - applinks = new HashMap(); - emaillinks = new HashMap(); - final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - Map.Entry entry; - while (i.hasNext()) { - entry = i.next(); - url = entry.getKey(); - if (url == null) continue; - u = url.toNormalform(true, false); - if (u.startsWith("mailto:")) { - emaillinks.put(u.substring(7), entry.getValue()); - } else { - extpos = u.lastIndexOf('.'); - if (extpos > 0) { - if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) { - ext = u.substring(extpos + 1, qpos).toLowerCase(); - } else { - ext = u.substring(extpos + 1).toLowerCase(); - } - if (Classification.isMediaExtension(ext)) { - // this is not a normal anchor, its a media link - if (Classification.isImageExtension(ext)) { - ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1, -1)); + synchronized (this) { + if (this.resorted) return; + // extract hyperlinks, medialinks and emaillinks from anchorlinks + MultiProtocolURI url; + String u; + int extpos, qpos; + String ext = null; + String thishost = this.source.getHost(); + this.inboundlinks = new HashMap(); + this.outboundlinks = new HashMap(); + this.hyperlinks = new HashMap(); + this.videolinks = new HashMap(); + this.audiolinks = new HashMap(); + this.applinks = new HashMap(); + this.emaillinks = new HashMap(); + final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + for (Map.Entry entry: collectedImages.entrySet()) { + if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); + } + for 
(Map.Entry entry: anchors.entrySet()) { + url = entry.getKey(); + if (url == null) continue; + if (url.getHost() != null && thishost != null && url.getHost().equals(thishost)) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor"); + u = url.toNormalform(true, false); + String name = entry.getValue().getProperty("name", ""); + if (u.startsWith("mailto:")) { + emaillinks.put(u.substring(7), name); + } else { + extpos = u.lastIndexOf('.'); + if (extpos > 0) { + if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) { + ext = u.substring(extpos + 1, qpos).toLowerCase(); + } else { + ext = u.substring(extpos + 1).toLowerCase(); + } + if (Classification.isMediaExtension(ext)) { + // this is not a normal anchor, its a media link + if (Classification.isImageExtension(ext)) { + ContentScraper.addImage(collectedImages, new ImageEntry(url, name, -1, -1, -1)); + } + else if (Classification.isAudioExtension(ext)) audiolinks.put(url, name); + else if (Classification.isVideoExtension(ext)) videolinks.put(url, name); + else if (Classification.isApplicationExtension(ext)) applinks.put(url, name); } - else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue()); - else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue()); - else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue()); } + // in any case we consider this as a link and let the parser decide if that link can be followed + hyperlinks.put(url, name); } - // in any case we consider this as a link and let the parser decide if that link can be followed - hyperlinks.put(url, entry.getValue()); } + + // add image links that we collected from the anchors to the image map + ContentScraper.addAllImages(images, collectedImages); + + // expand the hyperlinks: + // we add artificial hyperlinks to the hyperlink set + // that can be calculated from given hyperlinks and imagelinks + + 
hyperlinks.putAll(allReflinks(images.values())); + hyperlinks.putAll(allReflinks(audiolinks.keySet())); + hyperlinks.putAll(allReflinks(videolinks.keySet())); + hyperlinks.putAll(allReflinks(applinks.keySet())); + /* + hyperlinks.putAll(allSubpaths(hyperlinks.keySet())); + hyperlinks.putAll(allSubpaths(images.values())); + hyperlinks.putAll(allSubpaths(audiolinks.keySet())); + hyperlinks.putAll(allSubpaths(videolinks.keySet())); + hyperlinks.putAll(allSubpaths(applinks.keySet())); + */ + // don't do this again + this.resorted = true; } - - // add image links that we collected from the anchors to the image map - ContentScraper.addAllImages(images, collectedImages); - - // expand the hyperlinks: - // we add artificial hyperlinks to the hyperlink set - // that can be calculated from given hyperlinks and imagelinks - - hyperlinks.putAll(allReflinks(images.values())); - hyperlinks.putAll(allReflinks(audiolinks.keySet())); - hyperlinks.putAll(allReflinks(videolinks.keySet())); - hyperlinks.putAll(allReflinks(applinks.keySet())); - /* - hyperlinks.putAll(allSubpaths(hyperlinks.keySet())); - hyperlinks.putAll(allSubpaths(images.values())); - hyperlinks.putAll(allSubpaths(audiolinks.keySet())); - hyperlinks.putAll(allSubpaths(videolinks.keySet())); - hyperlinks.putAll(allSubpaths(applinks.keySet())); - */ - // don't do this again - this.resorted = true; } public static Map allSubpaths(final Collection links) { @@ -573,12 +587,24 @@ dc_rights this.favicon = faviconURL; } - public int inboundLinks() { - return (this.inboundLinks < 0) ? 0 : this.inboundLinks; + public int inboundLinkCount() { + if (this.inboundlinks == null) resortLinks(); + return (this.inboundlinks == null) ? 0 : this.inboundlinks.size(); + } + + public int outboundLinkCount() { + if (this.outboundlinks == null) resortLinks(); + return (this.outboundlinks == null) ? 0 : this.outboundlinks.size(); } - public int outboundLinks() { - return (this.outboundLinks < 0) ? 
0 : this.outboundLinks; + public Set inboundLinks() { + if (this.inboundlinks == null) resortLinks(); + return (this.inboundlinks == null) ? null : this.inboundlinks.keySet(); + } + + public Set outboundLinks() { + if (this.outboundlinks == null) resortLinks(); + return (this.outboundlinks == null) ? null : this.outboundlinks.keySet(); } public boolean indexingDenied() { @@ -608,7 +634,7 @@ dc_rights String language = this.dc_language(); if (language != null && language.length() > 0) os.write("" + this.dc_language() + "\n"); os.write("" + ISO8601Formatter.FORMATTER.format(date) + "\n"); - if (this.lon != 0.0f && this.lat != 0.0f) os.write("" + this.lon +"" + this.lat + "\n"); + if (this.lon != 0.0f && this.lat != 0.0f) os.write("" + this.lon +"" + this.lat + "\n"); os.write("\n"); } @@ -665,7 +691,7 @@ dc_rights final StringBuilder description = new StringBuilder(80); final LinkedList sectionTitles = new LinkedList(); - final Map anchors = new HashMap(); + final Map anchors = new HashMap(); final Map rss = new HashMap(); final Map images = new HashMap(); float lon = 0.0f, lat = 0.0f; @@ -716,6 +742,7 @@ dc_rights globalMime, null, null, + null, subjects.toString().split(" |,"), title.toString(), authors.toString(), diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index ce3bbfc7a..85b00b6e5 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -267,6 +267,7 @@ public class DCEntry extends TreeMap { getIdentifier(true), "text/html", "UTF-8", + this, languages, getSubject(), getTitle(), diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index a3535a3cc..bb8a71030 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -64,6 +64,7 @@ public class csvParser extends AbstractParser implements Parser { location, mimeType, charset, + this, 
null, null, concatRow(table.get(0)), diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 1fc3a9288..d666132b1 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -88,6 +88,7 @@ public class docParser extends AbstractParser implements Parser { location, mimeType, "UTF-8", + this, null, null, title, diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index a96d468f0..efa979d9a 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -50,6 +50,7 @@ public class genericParser extends AbstractParser implements Parser { location, mimeType, charset, + this, null, null, location.getFileName().length() == 0 ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName()), // title diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 889e85523..52a6502b3 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -70,6 +70,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { linkTags0.add("meta"); linkTags0.add("area"); linkTags0.add("link"); + linkTags0.add("script"); linkTags0.add("embed"); //added by [MN] linkTags0.add("param"); //added by [MN] @@ -78,17 +79,27 @@ public class ContentScraper extends AbstractScraper implements Scraper { linkTags1.add("h2"); linkTags1.add("h3"); linkTags1.add("h4"); + linkTags1.add("h5"); + linkTags1.add("h6"); linkTags1.add("title"); + linkTags1.add("b"); + linkTags1.add("strong"); + linkTags1.add("i"); + linkTags1.add("li"); + linkTags1.add("iframe"); + //