diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java index dece70730..f36fd4b57 100644 --- a/htroot/IndexImportMediawiki_p.java +++ b/htroot/IndexImportMediawiki_p.java @@ -54,11 +54,16 @@ public class IndexImportMediawiki_p { } else { if (post.containsKey("file")) { final File sourcefile = new File(post.get("file")); - MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); - MediawikiImporter.job.start(); + if (sourcefile.exists()) { + MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); + MediawikiImporter.job.start(); + prop.put("import_dump", MediawikiImporter.job.source()); + prop.put("import_thread", "started"); + } else { + prop.put("import_dump", ""); + prop.put("import_thread", "Error: file not found ["+sourcefile+"]"); + } prop.put("import", 1); - prop.put("import_thread", "started"); - prop.put("import_dump", MediawikiImporter.job.source()); prop.put("import_count", 0); prop.put("import_speed", 0); prop.put("import_runningHours", 0); @@ -66,7 +71,6 @@ public class IndexImportMediawiki_p { prop.put("import_remainingHours", 0); prop.put("import_remainingMinutes", 0); } - return prop; } } return prop; diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 7752f4f30..ed3f45596 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -59,7 +59,6 @@ import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; -import net.yacy.document.parser.html.CharacterCoding; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file @@ -225,13 +224,15 @@ public class MultiProtocolURL implements Serializable, Comparable 0) ? 1 : 0); } - private void escapeAnchor() { - this.anchor = escape(this.anchor).toString(); - } - private void escapeSearchpart() { final String[] questp = CommonPattern.AMP.split(this.searchpart, -1); final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); @@ -517,24 +514,39 @@ public class MultiProtocolURL implements Serializable, Comparable= '0' && s.charAt(i + 1) <= '9' && s.charAt(i + 2) >= '0' && s.charAt(i + 2) <= '9') { + sbuf.append((char)ch); // lets consider this is used for encoding, leave it that way + } else { + sbuf.append("%23"); // RFC 1738 2.2 unsafe char shall be encoded + } + } else if (ch == '&') { + if (i < len - 6 && "amp;".equals(s.substring(i + 1, i + 5).toLowerCase())) { + sbuf.append((char)ch); // leave it that way, it is used the right way + } else { + sbuf.append("&"); // this must be urlencoded + } sbuf.append((char)ch); - } else if ('0' <= ch && ch <= '9') { // '0'..'9' + } else if (ch == '#') { // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding sbuf.append((char)ch); - } else if (ch == ' ') { // space - sbuf.append("%20"); - } else if (ch == '&' || ch == ':' // unreserved + } else if (ch == '!' || ch == ':' // unreserved || ch == '-' || ch == '_' - || ch == '.' || ch == '!' - || ch == '~' || ch == '*' - || ch == '\'' || ch == '(' - || ch == ')' || ch == ';' - || ch == ',' || ch == '=') { // RFC 1738 2.2 special char (may be used unencoded) + || ch == '.' || ch == '~' + || ch == '*' || ch == '\'' + || ch == '(' || ch == ')' + || ch == '{' || ch == '}' + || ch == ';' || ch == ',' || ch == '=') { // RFC 1738 2.2 unsafe char (may be used unencoded) + sbuf.append((char)ch); + } else if ('0' <= ch && ch <= '9') { // '0'..'9' sbuf.append((char)ch); } else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced sbuf.append((char)ch); + } else if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' + sbuf.append((char)ch); + } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' + sbuf.append((char)ch); } else if (ch <= 0x007f) { // other ASCII sbuf.append(hex[ch]); } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF @@ -647,11 +659,15 @@ public class MultiProtocolURL implements Serializable, Comparable/ may have many '/' if the host is omitted and the path starts with '/' new String[]{null, "file:///bin/yacy2"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' + new String[]{null, "file:/bin/yacy1"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' new String[]{null, "file:C:WINDOWS\\CMD.EXE"}, new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"}, new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"}, diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 5537b767b..7e2696088 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -173,7 +173,7 @@ public class SchemaConfiguration extends Configuration implements Serializable { String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null; Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null; if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) && - (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0) && + (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) && (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) && (httpstatus_i == null || httpstatus_i.intValue() == 200)) { uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] { @@ -190,14 +190,17 @@ public class SchemaConfiguration extends Configuration implements Serializable { continue uniquecheck; } try { - SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName()); - if (docs.getNumFound() == 0) { - sid.setField(uniquefield.getSolrFieldName(), true); - } else { - boolean firstappearance = true; - for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;} - sid.setField(uniquefield.getSolrFieldName(), firstappearance); - } + long doccount = segment.fulltext().getDefaultConnector().getCountByQuery( + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + + "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 + "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 + "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 + "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 + "(-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":[* TO *] OR " + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true ) AND " + + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + + "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\""); + sid.setField(uniquefield.getSolrFieldName(), doccount == 0); } catch (final IOException e) {} } } diff --git a/source/net/yacy/cora/protocol/ResponseHeader.java b/source/net/yacy/cora/protocol/ResponseHeader.java index 92f4fd84b..7c1b3924c 100644 --- a/source/net/yacy/cora/protocol/ResponseHeader.java +++ b/source/net/yacy/cora/protocol/ResponseHeader.java @@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework { if (x_robots_tag.isEmpty()) { x_robots_tag = this.get(HeaderFramework.X_ROBOTS, ""); } - return x_robots_tag; + return x_robots_tag.toLowerCase(); } + } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 040227428..fe3a1b0c2 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -90,7 +90,7 @@ public class Document { private MultiProtocolURL favicon; private boolean resorted; private final Set languages; - private final boolean indexingDenied; + private boolean indexingDenied; private final double lon, lat; private final Object parserObject; // the source object that was used to create the Document private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document @@ -733,6 +733,10 @@ dc_rights return this.indexingDenied; } + public void setIndexingDenied(boolean indexingDenied) { + this.indexingDenied = indexingDenied; + } + public void setDepth(int depth) { this.crawldepth = depth; } @@ -819,6 +823,7 @@ dc_rights final LinkedHashMap images = new LinkedHashMap(); final Set languages = new HashSet(); double lon = 0.0d, lat = 0.0d; + boolean indexingDenied = false; Date date = new Date(); String charset = null; @@ -867,6 +872,8 @@ dc_rights if (doc.getDepth() < mindepth) mindepth = doc.getDepth(); if (doc.dc_language() != null) languages.add(doc.dc_language()); + + indexingDenied |= doc.indexingDenied; } // clean up parser data @@ -898,7 +905,7 @@ dc_rights anchors, rss, images, - false, + indexingDenied, date); newDoc.setDepth(mindepth); return newDoc; diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java index f93300cbd..7541e22e1 100644 --- a/source/net/yacy/document/parser/html/CharacterCoding.java +++ b/source/net/yacy/document/parser/html/CharacterCoding.java @@ -312,6 +312,13 @@ public final class CharacterCoding { } s = text.substring(p, q + 1); p = q + 1; + // check if another ampersand is in between + int pp; + while ((pp = s.indexOf('&', 1)) >= 0) { + // we skip the first ampersand + sb.append(s.substring(0, pp)); + s = s.substring(pp); + } if (s.equals(AMP_HTML)) { sb.append(AMP_UNICODE); continue; @@ -340,7 +347,8 @@ public final class CharacterCoding { } catch (final NumberFormatException e) { } continue; } - // the entity is unknown, skip it + // the entity is unknown, copy it + sb.append(s); } return sb.toString(); } diff --git a/source/net/yacy/http/AbstractRemoteHandler.java b/source/net/yacy/http/AbstractRemoteHandler.java index 776e8362c..4e779ec4f 100644 --- a/source/net/yacy/http/AbstractRemoteHandler.java +++ b/source/net/yacy/http/AbstractRemoteHandler.java @@ -49,7 +49,7 @@ import org.eclipse.jetty.server.Request; */ abstract public class AbstractRemoteHandler extends ConnectHandler implements Handler { - protected Switchboard sb = null; + protected Switchboard sb = null; private List localVirtualHostNames; // list for quick check for req to local peer @Override @@ -66,6 +66,7 @@ abstract public class AbstractRemoteHandler extends ConnectHandler implements Ha if (localInetAddress != null) { if (!localVirtualHostNames.contains(localInetAddress.getHostName())) { localVirtualHostNames.add(localInetAddress.getHostName()); + localVirtualHostNames.add(localInetAddress.getHostAddress()); // same as getServer().getURI().getHost() } if (!localVirtualHostNames.contains(localInetAddress.getCanonicalHostName())) { diff --git a/source/net/yacy/http/servlets/YaCyDefaultServlet.java b/source/net/yacy/http/servlets/YaCyDefaultServlet.java index 9b9703fbc..7eb2eb16d 100644 --- a/source/net/yacy/http/servlets/YaCyDefaultServlet.java +++ b/source/net/yacy/http/servlets/YaCyDefaultServlet.java @@ -895,7 +895,7 @@ public class YaCyDefaultServlet extends HttpServlet { // add the application version, the uptime and the client name to every rewrite table templatePatterns.put(servletProperties.PEER_STAT_VERSION, yacyBuildProperties.getVersion()); - templatePatterns.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - serverCore.startupTime) / 1000) / 60); // uptime in minutes + templatePatterns.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - sb.startupTime) / 1000) / 60); // uptime in minutes templatePatterns.putHTML(servletProperties.PEER_STAT_CLIENTNAME, sb.peers.mySeed().getName()); templatePatterns.putHTML(servletProperties.PEER_STAT_CLIENTID, sb.peers.myID()); templatePatterns.put(servletProperties.PEER_STAT_MYTIME, GenericFormatter.SHORT_SECOND_FORMATTER.format()); diff --git a/source/net/yacy/peers/Network.java b/source/net/yacy/peers/Network.java index 305e63bdc..00e90f95a 100644 --- a/source/net/yacy/peers/Network.java +++ b/source/net/yacy/peers/Network.java @@ -89,11 +89,6 @@ public class Network // class variables Switchboard sb; - public static int yacyTime() { - // the time since startup of yacy in seconds - return Math.max(0, (int) ((System.currentTimeMillis() - serverCore.startupTime) / 1000)); - } - public Network(final Switchboard sb) { final long time = System.currentTimeMillis(); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index ca15d13fb..c25cde6d6 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -355,7 +355,14 @@ public final class LoaderDispatcher { if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url); // parse resource - return response.parse(); + Document[] documents = response.parse(); + + String x_robots_tag = response.getResponseHeader().getXRobotsTag(); + if (x_robots_tag.indexOf("noindex",0) >= 0) { + for (Document d: documents) d.setIndexingDenied(true); + } + + return documents; } public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { @@ -371,7 +378,12 @@ public final class LoaderDispatcher { // parse resource try { Document[] documents = response.parse(); - return Document.mergeDocuments(location, response.getMimeType(), documents); + Document merged = Document.mergeDocuments(location, response.getMimeType(), documents); + + String x_robots_tag = response.getResponseHeader().getXRobotsTag(); + if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true); + + return merged; } catch(final Parser.Failure e) { throw new IOException(e.getMessage()); } diff --git a/source/net/yacy/search/IndexAbstracts.java b/source/net/yacy/search/IndexAbstracts.java deleted file mode 100644 index 165d2958c..000000000 --- a/source/net/yacy/search/IndexAbstracts.java +++ /dev/null @@ -1,69 +0,0 @@ -// IndexAbstracts.java -// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 10.10.2005 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.search; - -import java.util.Iterator; -import java.util.Map; -import java.util.TreeMap; - -public class IndexAbstracts extends TreeMap> { - - private static final long serialVersionUID = 3037740969349726216L; - - public IndexAbstracts() { - super(); - } - - public String wordsFromPeer(final String peerhash, final String urls) { - Map.Entry> entry; - String word, peerlist, url, wordlist = ""; - TreeMap urlPeerlist; - int p; - boolean hasURL; - synchronized (this) { - final Iterator>> i = this.entrySet().iterator(); - while (i.hasNext()) { - entry = i.next(); - word = entry.getKey(); - urlPeerlist = entry.getValue(); - hasURL = true; - for (int j = 0; j < urls.length(); j = j + 12) { - url = urls.substring(j, j + 12); - peerlist = urlPeerlist.get(url); - p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash); - if ((p < 0) || (p % 12 != 0)) { - hasURL = false; - break; - } - } - if (hasURL) wordlist += word; - } - } - return wordlist; - } - -} \ No newline at end of file diff --git a/source/net/yacy/search/StorageQueueEntry.java b/source/net/yacy/search/StorageQueueEntry.java deleted file mode 100644 index d623a45c4..000000000 --- a/source/net/yacy/search/StorageQueueEntry.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * StorageQueueEntry - * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany - * First released 30.05.2013 at http://yacy.net - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.search; - -import org.apache.solr.common.SolrInputDocument; - -import net.yacy.kelondro.workflow.WorkflowJob; - -public class StorageQueueEntry extends WorkflowJob { - - public SolrInputDocument queueEntry; - - public StorageQueueEntry(final SolrInputDocument queueEntry) { - super(); - this.queueEntry = queueEntry; - } -} diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d382caaff..033179693 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -194,7 +194,6 @@ import net.yacy.search.ranking.RankingProfile; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.WebgraphConfiguration; -import net.yacy.server.serverCore; import net.yacy.server.serverSwitch; import net.yacy.server.http.RobotsTxtConfig; import net.yacy.utils.CryptoLib; @@ -281,6 +280,7 @@ public final class Switchboard extends serverSwitch { public boolean useTailCache; public boolean exceed134217727; + public final long startupTime = System.currentTimeMillis(); private final Semaphore shutdownSync = new Semaphore(0); private boolean terminate = false; private boolean startupAction = true; // this is set to false after the first event @@ -3607,22 +3607,22 @@ public final class Switchboard extends serverSwitch { } public float averageQPM() { - final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000; + final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000; return (this.searchQueriesRobinsonFromRemote + this.searchQueriesGlobal) * 60f / Math.max(uptime, 1f); } public float averageQPMGlobal() { - final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000; + final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000; return (this.searchQueriesGlobal) * 60f / Math.max(uptime, 1f); } public float averageQPMPrivateLocal() { - final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000; + final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000; return (this.searchQueriesRobinsonFromLocal) * 60f / Math.max(uptime, 1f); } public float averageQPMPublicLocal() { - final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000; + final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000; return (this.searchQueriesRobinsonFromRemote) * 60f / Math.max(uptime, 1f); } @@ -3632,7 +3632,7 @@ public final class Switchboard extends serverSwitch { this.peers.mySeed().put(Seed.PORT, getConfig("port", "8090")); //the speed of indexing (pages/minute) of the peer - final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000; + final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000; Seed mySeed = this.peers.mySeed(); mySeed.put(Seed.ISPEED, Integer.toString(currentPPM())); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 137f7050c..0be3a2e7a 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -130,7 +130,6 @@ public final class SearchEvent { public final List nodeSearchThreads; public Thread[] secondarySearchThreads; public final SortedMap preselectedPeerHashes; - private final Thread localSearchThread; private final SortedMap IACount; private final SortedMap IAResults; private final SortedMap heuristics; @@ -249,7 +248,6 @@ public final class SearchEvent { this.heuristics = new TreeMap(Base64Order.enhancedCoder); this.IAmaxcounthash = null; this.IAneardhthash = null; - this.localSearchThread = null; this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false))); this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering this.local_rwi_stored = new AtomicInteger(0); @@ -650,7 +648,6 @@ public final class SearchEvent { // clear all data structures if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear(); - if (this.localSearchThread != null && this.localSearchThread.isAlive()) this.localSearchThread.interrupt(); if (this.IACount != null) this.IACount.clear(); if (this.IAResults != null) this.IAResults.clear(); if (this.heuristics != null) this.heuristics.clear(); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index a81320e30..f000d20da 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -397,13 +397,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); final DigestURL digestURL = document.dc_source(); - final String id = ASCII.String(digestURL.hash()); boolean allAttr = this.isEmpty(); String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL)); Set processTypes = new LinkedHashSet(); String host = digestURL.getHost(); - String us = digestURL.toNormalform(true); int crawldepth = document.getDepth(); if ((allAttr || contains(CollectionSchema.crawldepth_i))) { @@ -562,22 +560,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // bit 15: "noimageindex" contained in http header X-Robots-Tag // bit 16: "unavailable_after" contained in http header X-Robots-Tag int b = 0; - final String robots_meta = html.getMetas().get("robots"); + String robots_meta = html.getMetas().get("robots"); // this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html if (robots_meta != null) { + robots_meta = robots_meta.toLowerCase(); if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0 if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1 if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2 if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3 if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4 } - String x_robots_tag = ""; - if (responseHeader != null) { - x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, ""); - if (x_robots_tag.isEmpty()) { - x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, ""); - } - } + String x_robots_tag = responseHeader.getXRobotsTag(); if (!x_robots_tag.isEmpty()) { // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8 @@ -754,14 +747,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } } - if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) { + if (canonical != null) { containsCanonical = true; inboundLinks.remove(canonical); outboundLinks.remove(canonical); add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false)); // set a flag if this is equal to sku if (contains(CollectionSchema.canonical_equal_sku_b)) { - add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(us)); + add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL)); } } } diff --git a/source/net/yacy/server/serverCore.java b/source/net/yacy/server/serverCore.java index 5da152543..f02bd4934 100644 --- a/source/net/yacy/server/serverCore.java +++ b/source/net/yacy/server/serverCore.java @@ -37,7 +37,6 @@ public final class serverCore { public static final byte[] CRLF = {CR, LF}; public static final String CRLF_STRING = UTF8.String(CRLF); public static final String LF_STRING = UTF8.String(new byte[]{LF}); - public static final long startupTime = System.currentTimeMillis(); public static boolean useStaticIP = false; diff --git a/source/net/yacy/server/serverSwitchAbstractAction.java b/source/net/yacy/server/serverSwitchAbstractAction.java deleted file mode 100644 index 06881f5ff..000000000 --- a/source/net/yacy/server/serverSwitchAbstractAction.java +++ /dev/null @@ -1,52 +0,0 @@ -// serverSwitchAbstractAction.java -// ------------------------------------- -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 11.05.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.server; - -import net.yacy.cora.util.ConcurrentLog; - -public abstract class serverSwitchAbstractAction { - - protected ConcurrentLog log = null; - private String shortDescr = "", longDescr = ""; - - public void setDescription(final String shortText, final String longText) { - // sets a visible description string - this.shortDescr = shortText; - this.longDescr = longText; - } - - public String getShortDescription() { - // returns short description string for online display - return this.shortDescr; - } - - public String getLongDescription() { - // returns long description string for online display - return this.longDescr; - } - - public void setLog(final ConcurrentLog log) { - // defines a log where process states can be written to - this.log = log; - } - -}