Merge branch 'master' of git@gitorious.org:yacy/rc1.git

pull/1/head
orbiter 11 years ago
commit dab9a0786a

@ -54,11 +54,16 @@ public class IndexImportMediawiki_p {
} else {
if (post.containsKey("file")) {
final File sourcefile = new File(post.get("file"));
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
if (sourcefile.exists()) {
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_thread", "started");
} else {
prop.put("import_dump", "");
prop.put("import_thread", "Error: file not found ["+sourcefile+"]");
}
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
@ -66,7 +71,6 @@ public class IndexImportMediawiki_p {
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
}
return prop;
}
}
return prop;

@ -59,7 +59,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.parser.html.CharacterCoding;
/**
* MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -225,13 +224,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (h.startsWith("///")) { //absolute local file path
// no host given
this.path = h.substring(2); // "/path" or "/c:/path"
} else { // "//host/path" or "//host/c:/path"
} else if (h.startsWith("//")) { // "//host/path" or "//host/c:/path"
int q = url.indexOf('/', p + 3);
if (q < 0) {
this.path = "/";
} else {
this.path = url.substring(q);
}
} else if (h.startsWith("/")) { // "/host/path" or "/host/c:/path"
this.path = h;
}
this.userInfo = null;
this.port = -1;
@ -418,7 +419,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
private void escape() {
if (this.path != null && this.path.indexOf('%') == -1) escapePath();
if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
if (this.anchor != null && this.anchor.indexOf('%') == -1) escapeAnchor();
if (this.anchor != null) this.anchor = escape(this.anchor).toString();
}
private void escapePath() {
@ -431,10 +432,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
this.path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
}
private void escapeAnchor() {
this.anchor = escape(this.anchor).toString();
}
private void escapeSearchpart() {
final String[] questp = CommonPattern.AMP.split(this.searchpart, -1);
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
@ -517,24 +514,39 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '%') {
if (i < len - 2 && s.charAt(i + 1) >= '0' && s.charAt(i + 1) <= '9' && s.charAt(i + 2) >= '0' && s.charAt(i + 2) <= '9') {
sbuf.append((char)ch); // lets consider this is used for encoding, leave it that way
} else {
sbuf.append("%23"); // RFC 1738 2.2 unsafe char shall be encoded
}
} else if (ch == '&') {
if (i < len - 6 && "amp;".equals(s.substring(i + 1, i + 5).toLowerCase())) {
sbuf.append((char)ch); // leave it that way, it is used the right way
} else {
sbuf.append("&amp;"); // this must be urlencoded
}
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
} else if (ch == '#') { // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // unreserved
} else if (ch == '!' || ch == ':' // unreserved
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';'
|| ch == ',' || ch == '=') { // RFC 1738 2.2 special char (may be used unencoded)
|| ch == '.' || ch == '~'
|| ch == '*' || ch == '\''
|| ch == '(' || ch == ')'
|| ch == '{' || ch == '}'
|| ch == ';' || ch == ',' || ch == '=') { // RFC 1738 2.2 unsafe char (may be used unencoded)
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced
sbuf.append((char)ch);
} else if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if (ch <= 0x007f) { // other ASCII
sbuf.append(hex[ch]);
} else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
@ -647,11 +659,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
} else {
this.searchpart = this.path.substring(r + 1);
// strip &amp;
/*
Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
while (matcher.find()) {
int from = 0;
while (matcher.find(from)) {
from = matcher.start() + 1;
this.searchpart = matcher.replaceAll("&");
matcher.reset(this.searchpart);
}
*/
this.path = this.path.substring(0, r);
}
}
@ -934,7 +950,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
String urlPath = this.getFile(excludeAnchor, removeSessionID);
String h = getHost();
final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length()));
final StringBuilder u = new StringBuilder(20 + (urlPath == null ? 0 : urlPath.length()) + ((h == null) ? 0 : h.length()));
u.append(this.protocol);
u.append("://");
if (h != null) {
@ -2179,10 +2195,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
*/
public static void main(final String[] args) {
final String[][] test = new String[][]{
new String[]{null, "https://www.example.com/shoe/?p=2&ps=75#t={%22san_NaviPaging%22:2}"}, // ugly strange pagination link
new String[]{null, "C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},

@ -173,7 +173,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) &&
(robots_i == null || (robots_i.intValue() & (1 << 9)) == 0) &&
(robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) &&
(canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) &&
(httpstatus_i == null || httpstatus_i.intValue() == 200)) {
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] {
@ -190,14 +190,17 @@ public class SchemaConfiguration extends Configuration implements Serializable {
continue uniquecheck;
}
try {
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName());
if (docs.getNumFound() == 0) {
sid.setField(uniquefield.getSolrFieldName(), true);
} else {
boolean firstappearance = true;
for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;}
sid.setField(uniquefield.getSolrFieldName(), firstappearance);
}
long doccount = segment.fulltext().getDefaultConnector().getCountByQuery(
CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " +
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10
"(-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":[* TO *] OR " + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true ) AND " +
CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
"-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +
signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"");
sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
} catch (final IOException e) {}
}
}

@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework {
if (x_robots_tag.isEmpty()) {
x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
}
return x_robots_tag;
return x_robots_tag.toLowerCase();
}
}

@ -90,7 +90,7 @@ public class Document {
private MultiProtocolURL favicon;
private boolean resorted;
private final Set<String> languages;
private final boolean indexingDenied;
private boolean indexingDenied;
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
@ -733,6 +733,10 @@ dc_rights
return this.indexingDenied;
}
/**
 * Overwrites the flag that {@code indexingDenied} reads are served from.
 *
 * @param indexingDenied the new value of the flag
 */
public void setIndexingDenied(final boolean indexingDenied) {
    this.indexingDenied = indexingDenied;
}
/**
 * Stores the given value as this object's crawl depth.
 *
 * @param depth the value assigned to {@code crawldepth}
 */
public void setDepth(final int depth) {
    this.crawldepth = depth;
}
@ -819,6 +823,7 @@ dc_rights
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final Set<String> languages = new HashSet<String>();
double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false;
Date date = new Date();
String charset = null;
@ -867,6 +872,8 @@ dc_rights
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
if (doc.dc_language() != null) languages.add(doc.dc_language());
indexingDenied |= doc.indexingDenied;
}
// clean up parser data
@ -898,7 +905,7 @@ dc_rights
anchors,
rss,
images,
false,
indexingDenied,
date);
newDoc.setDepth(mindepth);
return newDoc;

@ -312,6 +312,13 @@ public final class CharacterCoding {
}
s = text.substring(p, q + 1);
p = q + 1;
// check if another ampersand is in between
int pp;
while ((pp = s.indexOf('&', 1)) >= 0) {
// we skip the first ampersand
sb.append(s.substring(0, pp));
s = s.substring(pp);
}
if (s.equals(AMP_HTML)) {
sb.append(AMP_UNICODE);
continue;
@ -340,7 +347,8 @@ public final class CharacterCoding {
} catch (final NumberFormatException e) { }
continue;
}
// the entity is unknown, skip it
// the entity is unknown, copy it
sb.append(s);
}
return sb.toString();
}

@ -49,7 +49,7 @@ import org.eclipse.jetty.server.Request;
*/
abstract public class AbstractRemoteHandler extends ConnectHandler implements Handler {
protected Switchboard sb = null;
protected Switchboard sb = null;
private List<String> localVirtualHostNames; // list for quick check for req to local peer
@Override
@ -66,6 +66,7 @@ abstract public class AbstractRemoteHandler extends ConnectHandler implements Ha
if (localInetAddress != null) {
if (!localVirtualHostNames.contains(localInetAddress.getHostName())) {
localVirtualHostNames.add(localInetAddress.getHostName());
localVirtualHostNames.add(localInetAddress.getHostAddress()); // same as getServer().getURI().getHost()
}
if (!localVirtualHostNames.contains(localInetAddress.getCanonicalHostName())) {

@ -895,7 +895,7 @@ public class YaCyDefaultServlet extends HttpServlet {
// add the application version, the uptime and the client name to every rewrite table
templatePatterns.put(servletProperties.PEER_STAT_VERSION, yacyBuildProperties.getVersion());
templatePatterns.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - serverCore.startupTime) / 1000) / 60); // uptime in minutes
templatePatterns.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - sb.startupTime) / 1000) / 60); // uptime in minutes
templatePatterns.putHTML(servletProperties.PEER_STAT_CLIENTNAME, sb.peers.mySeed().getName());
templatePatterns.putHTML(servletProperties.PEER_STAT_CLIENTID, sb.peers.myID());
templatePatterns.put(servletProperties.PEER_STAT_MYTIME, GenericFormatter.SHORT_SECOND_FORMATTER.format());

@ -89,11 +89,6 @@ public class Network
// class variables
Switchboard sb;
/**
 * Reports the time since startup of yacy in seconds.
 *
 * @return the whole seconds elapsed since {@code serverCore.startupTime},
 *         clamped so that the result is never negative
 */
public static int yacyTime() {
    final long elapsedMillis = System.currentTimeMillis() - serverCore.startupTime;
    final int elapsedSeconds = (int) (elapsedMillis / 1000);
    return Math.max(0, elapsedSeconds);
}
public Network(final Switchboard sb) {
final long time = System.currentTimeMillis();

@ -355,7 +355,14 @@ public final class LoaderDispatcher {
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
// parse resource
return response.parse();
Document[] documents = response.parse();
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
for (Document d: documents) d.setIndexingDenied(true);
}
return documents;
}
public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@ -371,7 +378,12 @@ public final class LoaderDispatcher {
// parse resource
try {
Document[] documents = response.parse();
return Document.mergeDocuments(location, response.getMimeType(), documents);
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true);
return merged;
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}

@ -1,69 +0,0 @@
// IndexAbstracts.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
public class IndexAbstracts extends TreeMap<String, TreeMap<String, String>> {
private static final long serialVersionUID = 3037740969349726216L;
public IndexAbstracts() {
super();
}
public String wordsFromPeer(final String peerhash, final String urls) {
Map.Entry<String, TreeMap<String, String>> entry;
String word, peerlist, url, wordlist = "";
TreeMap<String, String> urlPeerlist;
int p;
boolean hasURL;
synchronized (this) {
final Iterator<Map.Entry <String, TreeMap<String, String>>> i = this.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
word = entry.getKey();
urlPeerlist = entry.getValue();
hasURL = true;
for (int j = 0; j < urls.length(); j = j + 12) {
url = urls.substring(j, j + 12);
peerlist = urlPeerlist.get(url);
p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
if ((p < 0) || (p % 12 != 0)) {
hasURL = false;
break;
}
}
if (hasURL) wordlist += word;
}
}
return wordlist;
}
}

@ -1,35 +0,0 @@
/**
* StorageQueueEntry
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 30.05.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.kelondro.workflow.WorkflowJob;
/**
 * A {@link WorkflowJob} whose only payload is a single {@link SolrInputDocument}.
 */
public class StorageQueueEntry extends WorkflowJob {

    /** The document carried by this job; assigned by the constructor. */
    public SolrInputDocument queueEntry;

    /**
     * Wraps the given document in a new job.
     *
     * @param queueEntry the document to carry; the reference is stored as given
     */
    public StorageQueueEntry(final SolrInputDocument queueEntry) {
        // the no-argument superclass constructor runs implicitly
        this.queueEntry = queueEntry;
    }
}

@ -194,7 +194,6 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.server.serverCore;
import net.yacy.server.serverSwitch;
import net.yacy.server.http.RobotsTxtConfig;
import net.yacy.utils.CryptoLib;
@ -281,6 +280,7 @@ public final class Switchboard extends serverSwitch {
public boolean useTailCache;
public boolean exceed134217727;
public final long startupTime = System.currentTimeMillis();
private final Semaphore shutdownSync = new Semaphore(0);
private boolean terminate = false;
private boolean startupAction = true; // this is set to false after the first event
@ -3607,22 +3607,22 @@ public final class Switchboard extends serverSwitch {
}
public float averageQPM() {
final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
return (this.searchQueriesRobinsonFromRemote + this.searchQueriesGlobal) * 60f / Math.max(uptime, 1f);
}
public float averageQPMGlobal() {
final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
return (this.searchQueriesGlobal) * 60f / Math.max(uptime, 1f);
}
public float averageQPMPrivateLocal() {
final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
return (this.searchQueriesRobinsonFromLocal) * 60f / Math.max(uptime, 1f);
}
public float averageQPMPublicLocal() {
final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
return (this.searchQueriesRobinsonFromRemote) * 60f / Math.max(uptime, 1f);
}
@ -3632,7 +3632,7 @@ public final class Switchboard extends serverSwitch {
this.peers.mySeed().put(Seed.PORT, getConfig("port", "8090"));
//the speed of indexing (pages/minute) of the peer
final long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
final long uptime = (System.currentTimeMillis() - this.startupTime) / 1000;
Seed mySeed = this.peers.mySeed();
mySeed.put(Seed.ISPEED, Integer.toString(currentPPM()));

@ -130,7 +130,6 @@ public final class SearchEvent {
public final List<Thread> nodeSearchThreads;
public Thread[] secondarySearchThreads;
public final SortedMap<byte[], String> preselectedPeerHashes;
private final Thread localSearchThread;
private final SortedMap<byte[], Integer> IACount;
private final SortedMap<byte[], String> IAResults;
private final SortedMap<byte[], HeuristicResult> heuristics;
@ -249,7 +248,6 @@ public final class SearchEvent {
this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localSearchThread = null;
this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false)));
this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering
this.local_rwi_stored = new AtomicInteger(0);
@ -650,7 +648,6 @@ public final class SearchEvent {
// clear all data structures
if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
if (this.localSearchThread != null && this.localSearchThread.isAlive()) this.localSearchThread.interrupt();
if (this.IACount != null) this.IACount.clear();
if (this.IAResults != null) this.IAResults.clear();
if (this.heuristics != null) this.heuristics.clear();

@ -397,13 +397,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
final DigestURL digestURL = document.dc_source();
final String id = ASCII.String(digestURL.hash());
boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
String host = digestURL.getHost();
String us = digestURL.toNormalform(true);
int crawldepth = document.getDepth();
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
@ -562,22 +560,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// bit 15: "noimageindex" contained in http header X-Robots-Tag
// bit 16: "unavailable_after" contained in http header X-Robots-Tag
int b = 0;
final String robots_meta = html.getMetas().get("robots");
String robots_meta = html.getMetas().get("robots");
// this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html
if (robots_meta != null) {
robots_meta = robots_meta.toLowerCase();
if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0
if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
}
String x_robots_tag = "";
if (responseHeader != null) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
}
}
String x_robots_tag = responseHeader.getXRobotsTag();
if (!x_robots_tag.isEmpty()) {
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
@ -754,14 +747,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
}
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
if (canonical != null) {
containsCanonical = true;
inboundLinks.remove(canonical);
outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
// set a flag if this is equal to sku
if (contains(CollectionSchema.canonical_equal_sku_b)) {
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(us));
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL));
}
}
}

@ -37,7 +37,6 @@ public final class serverCore {
public static final byte[] CRLF = {CR, LF};
public static final String CRLF_STRING = UTF8.String(CRLF);
public static final String LF_STRING = UTF8.String(new byte[]{LF});
public static final long startupTime = System.currentTimeMillis();
public static boolean useStaticIP = false;

@ -1,52 +0,0 @@
// serverSwitchAbstractAction.java
// -------------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 11.05.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.server;
import net.yacy.cora.util.ConcurrentLog;
/**
 * Abstract base that keeps a log reference together with a short and a long
 * description text, and exposes accessors for them.
 */
public abstract class serverSwitchAbstractAction {

    /** Log where process states can be written to; {@code null} until {@link #setLog} is called. */
    protected ConcurrentLog log = null;

    /** Short description string for online display; initially empty. */
    private String shortDescr = "";

    /** Long description string for online display; initially empty. */
    private String longDescr = "";

    /**
     * Sets the visible description strings.
     *
     * @param shortText the new short description
     * @param longText  the new long description
     */
    public void setDescription(final String shortText, final String longText) {
        this.longDescr = longText;
        this.shortDescr = shortText;
    }

    /**
     * @return the short description string for online display
     */
    public String getShortDescription() {
        return this.shortDescr;
    }

    /**
     * @return the long description string for online display
     */
    public String getLongDescription() {
        return this.longDescr;
    }

    /**
     * Defines the log where process states can be written to.
     *
     * @param log the log to use from now on
     */
    public void setLog(final ConcurrentLog log) {
        this.log = log;
    }
}
Loading…
Cancel
Save