This is a major change in the organization of indexes. Please consider backing up your data before you run this update. All existing index files will be moved and renamed to a new location. With this change, it will be possible to maintain different indexes for different purposes and to distinguish between DHT-in and DHT-out specific indexes. Tenants may also have their own index, and it may be possible to keep histories and back-ups of indexes. This is just the beginning: many servlets must be adapted after this change, but all functions that existed before should still work. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6389 6c8d7289-2bf4-0310-a012-ef5d649a1542 pull/1/head
parent
09de5da74a
commit
735e2737e3
@ -0,0 +1,209 @@
|
||||
//RobotsEntry.java
|
||||
//-------------------------------------
|
||||
//part of YACY
|
||||
//(C) by Michael Peter Christen; mc@yacy.net
|
||||
//first published on http://www.anomic.de
|
||||
//Frankfurt, Germany, 2004
|
||||
//
|
||||
//This file is contributed by Martin Thelian
|
||||
// [MC] moved some methods from robotsParser file that had been created by Alexander Schier to this class
|
||||
// [MC] redesign: moved the entry object out of the RobotsTxt class into this separate class
|
||||
|
||||
//last major change: $LastChangedDate$ by $LastChangedBy$
|
||||
//Revision: $LastChangedRevision$
|
||||
//
|
||||
//This program is free software; you can redistribute it and/or modify
|
||||
//it under the terms of the GNU General Public License as published by
|
||||
//the Free Software Foundation; either version 2 of the License, or
|
||||
//(at your option) any later version.
|
||||
//
|
||||
//This program is distributed in the hope that it will be useful,
|
||||
//but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
//GNU General Public License for more details.
|
||||
//
|
||||
//You should have received a copy of the GNU General Public License
|
||||
//along with this program; if not, write to the Free Software
|
||||
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.crawler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * One robots.txt record for a single host.
 *
 * <p>All properties are kept as strings in the {@link #mem} map (this is the form that is
 * handed in by the map-based constructor); the allow and disallow path lists are
 * additionally held as parsed lists. Inside the map a path list is a single string whose
 * elements are joined with {@link #ROBOTS_DB_PATH_SEPARATOR}.
 */
public class RobotsEntry {

    public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
    public static final String ALLOW_PATH_LIST          = "allow";
    public static final String DISALLOW_PATH_LIST       = "disallow";
    public static final String LOADED_DATE              = "date";
    public static final String MOD_DATE                 = "modDate";
    public static final String ETAG                     = "etag";
    public static final String SITEMAP                  = "sitemap";
    public static final String CRAWL_DELAY              = "crawlDelay";
    public static final String CRAWL_DELAY_MILLIS       = "crawlDelayMillis";

    // string properties of this robots.txt entry, keyed by the constants above
    Map<String, String> mem;
    private final LinkedList<String> allowPathList, denyPathList;
    String hostName;

    /**
     * Creates an entry from an already existing property map.
     *
     * @param hostName the host this entry belongs to; stored in lower case
     * @param mem      the property map; it is used directly, not copied
     */
    public RobotsEntry(final String hostName, final Map<String, String> mem) {
        // Locale.ROOT: host names must not be case-mapped by the default locale
        this.hostName = hostName.toLowerCase(java.util.Locale.ROOT);
        this.mem = mem;
        // a missing key, a null value and an empty string all yield an empty list
        this.denyPathList = splitPathList(this.mem.get(DISALLOW_PATH_LIST));
        this.allowPathList = splitPathList(this.mem.get(ALLOW_PATH_LIST));
    }

    /**
     * Creates an entry from individual values and builds the property map from them.
     * Values that are {@code null} (or, for the crawl delay, not positive) and path lists
     * that are {@code null} or empty are not written to the map.
     *
     * @param hostName         the host this entry belongs to; trimmed and stored in lower case
     * @param allowPathList    allowed path prefixes, may be {@code null}
     * @param disallowPathList disallowed path prefixes, may be {@code null}
     * @param loadedDate       stored under {@link #LOADED_DATE} as epoch milliseconds
     * @param modDate          stored under {@link #MOD_DATE} as epoch milliseconds
     * @param eTag             stored under {@link #ETAG}
     * @param sitemap          stored under {@link #SITEMAP}
     * @param crawlDelayMillis stored under {@link #CRAWL_DELAY_MILLIS} if greater than zero
     * @throws IllegalArgumentException if {@code hostName} is {@code null} or empty
     */
    public RobotsEntry(
            final String hostName,
            final ArrayList<String> allowPathList,
            final ArrayList<String> disallowPathList,
            final Date loadedDate,
            final Date modDate,
            final String eTag,
            final String sitemap,
            final long crawlDelayMillis
    ) {
        if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");

        this.hostName = hostName.trim().toLowerCase(java.util.Locale.ROOT);
        this.allowPathList = new LinkedList<String>();
        this.denyPathList = new LinkedList<String>();

        this.mem = new HashMap<String, String>(8);
        if (loadedDate != null) this.mem.put(LOADED_DATE, Long.toString(loadedDate.getTime()));
        if (modDate != null) this.mem.put(MOD_DATE, Long.toString(modDate.getTime()));
        if (eTag != null) this.mem.put(ETAG, eTag);
        if (sitemap != null) this.mem.put(SITEMAP, sitemap);
        if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis));

        if ((allowPathList != null) && !allowPathList.isEmpty()) {
            this.allowPathList.addAll(allowPathList);
            this.mem.put(ALLOW_PATH_LIST, joinPathList(allowPathList));
        }

        if ((disallowPathList != null) && !disallowPathList.isEmpty()) {
            this.denyPathList.addAll(disallowPathList);
            this.mem.put(DISALLOW_PATH_LIST, joinPathList(disallowPathList));
        }
    }

    /** Splits a separator-joined path list; {@code null} or empty input gives an empty list. */
    private static LinkedList<String> splitPathList(final String joined) {
        final LinkedList<String> paths = new LinkedList<String>();
        if ((joined != null) && (joined.length() > 0)) {
            paths.addAll(Arrays.asList(joined.split(ROBOTS_DB_PATH_SEPARATOR)));
        }
        return paths;
    }

    /** Joins a non-empty path list with the separator (no trailing separator). */
    private static String joinPathList(final ArrayList<String> paths) {
        final StringBuilder joined = new StringBuilder();
        for (final String path : paths) {
            joined.append(path).append(ROBOTS_DB_PATH_SEPARATOR);
        }
        // drop the separator that was appended after the last element
        return joined.substring(0, joined.length() - 1);
    }

    /** Reads an epoch-millisecond property as a date; {@code null} if missing or not a number. */
    private Date dateProperty(final String key) {
        final String millis = this.mem.get(key);
        if (millis == null) return null;
        try {
            return new Date(Long.parseLong(millis));
        } catch (final NumberFormatException e) {
            // an unreadable value is treated like a missing one, as getCrawlDelayMillis() does
            return null;
        }
    }

    /**
     * @return the host name followed by {@code ": "} and the string form of the property map
     */
    @Override
    public String toString() {
        final StringBuilder str = new StringBuilder();
        str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
        if (this.mem != null) {
            str.append(this.mem.toString());
        }
        return str.toString();
    }

    /** @return the value stored under {@link #SITEMAP}, or {@code null} if there is none */
    public String getSitemap() {
        return this.mem.get(SITEMAP);
    }

    /** @return the date stored under {@link #LOADED_DATE}, or {@code null} if missing or unreadable */
    public Date getLoadedDate() {
        return dateProperty(LOADED_DATE);
    }

    /**
     * Overwrites the loaded date; a {@code null} argument leaves the entry unchanged.
     *
     * @param newLoadedDate the new date, stored as epoch milliseconds
     */
    public void setLoadedDate(final Date newLoadedDate) {
        if (newLoadedDate != null) {
            this.mem.put(LOADED_DATE, Long.toString(newLoadedDate.getTime()));
        }
    }

    /** @return the date stored under {@link #MOD_DATE}, or {@code null} if missing or unreadable */
    public Date getModDate() {
        return dateProperty(MOD_DATE);
    }

    /** @return the value stored under {@link #ETAG}, or {@code null} if there is none */
    public String getETag() {
        return this.mem.get(ETAG);
    }

    /**
     * Returns the crawl delay in milliseconds. {@link #CRAWL_DELAY_MILLIS} takes precedence;
     * otherwise {@link #CRAWL_DELAY} is multiplied by 1000.
     *
     * @return the delay in milliseconds, or 0 if none is stored or the stored value is not a number
     */
    public long getCrawlDelayMillis() {
        if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) {
            try {
                return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS));
            } catch (final NumberFormatException e) {
                return 0;
            }
        }
        if (this.mem.containsKey(CRAWL_DELAY)) {
            try {
                // multiply as long: an int product overflows for values above 2147483
                return 1000L * Integer.parseInt(this.mem.get(CRAWL_DELAY));
            } catch (final NumberFormatException e) {
                return 0;
            }
        }
        return 0;
    }

    /**
     * Checks a path against the disallow list using prefix matching.
     *
     * @param path the path to test; {@code null} or empty is treated as {@code "/"}
     * @return {@code true} if the path starts with one of the non-empty disallow rules
     */
    public boolean isDisallowed(String path) {
        if ((this.mem == null) || this.denyPathList.isEmpty()) return false;

        if ((path == null) || (path.length() == 0)) {
            // if the path is null or empty we set it to /
            path = "/";
        } else {
            // escape all occurrences of ';' because this char is the separator of stored path lists
            path = path.replace(ROBOTS_DB_PATH_SEPARATOR, "%3B");
        }

        for (final String rule : this.denyPathList) {
            // an empty rule would be a prefix of every path; an empty Disallow restricts nothing
            if ((rule != null) && (rule.length() > 0) && path.startsWith(rule)) {
                return true;
            }
        }
        return false;
    }

}
|
@ -0,0 +1,226 @@
|
||||
// Segments.java
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 30.07.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $
|
||||
// $LastChangedRevision: 5988 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
||||
import de.anomic.document.Condenser;
|
||||
import de.anomic.document.Document;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
import de.anomic.yacy.logging.Log;
|
||||
|
||||
public final class Segments implements Iterable<Segment> {
|
||||
|
||||
/**
|
||||
* process enumeration type
|
||||
* defines constants that can be used to assign process-related segment names
|
||||
*/
|
||||
public enum Process {
|
||||
|
||||
RECEIPTS,
|
||||
QUERIES,
|
||||
DHTIN,
|
||||
DHTOUT, // the only segment that is used for reading-only
|
||||
PROXY,
|
||||
LOCALCRAWLING,
|
||||
REMOTECRAWLING,
|
||||
PUBLIC; // includes the index that can be retrieved by the yacy p2p api
|
||||
|
||||
public String toString() {
|
||||
throw new UnsupportedOperationException("toString not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
private final Log log;
|
||||
private final File segmentsPath;
|
||||
private final int entityCacheMaxSize;
|
||||
private final long maxFileSize;
|
||||
private HashMap<String, Segment> segments;
|
||||
private HashMap<Process, String> process_assignment;
|
||||
private final boolean useTailCache;
|
||||
private final boolean exceed134217727;
|
||||
|
||||
public Segments(
|
||||
final Log log,
|
||||
final File segmentsPath,
|
||||
final int entityCacheMaxSize,
|
||||
final long maxFileSize,
|
||||
final boolean useTailCache,
|
||||
final boolean exceed134217727) throws IOException {
|
||||
this.log = log;
|
||||
this.segmentsPath = segmentsPath;
|
||||
this.entityCacheMaxSize = entityCacheMaxSize;
|
||||
this.maxFileSize = maxFileSize;
|
||||
this.useTailCache = useTailCache;
|
||||
this.exceed134217727 = exceed134217727;
|
||||
this.segments = new HashMap<String, Segment>();
|
||||
this.process_assignment = new HashMap<Process, String>();
|
||||
|
||||
// assign default segment names for the processes
|
||||
this.process_assignment.put(Process.RECEIPTS, "default");
|
||||
this.process_assignment.put(Process.QUERIES, "default");
|
||||
this.process_assignment.put(Process.DHTIN, "default");
|
||||
this.process_assignment.put(Process.DHTOUT, "default");
|
||||
this.process_assignment.put(Process.PROXY, "default");
|
||||
this.process_assignment.put(Process.LOCALCRAWLING, "default");
|
||||
this.process_assignment.put(Process.REMOTECRAWLING, "default");
|
||||
this.process_assignment.put(Process.PUBLIC, "default");
|
||||
}
|
||||
|
||||
public void setSegment(Process process, String segmentName) {
|
||||
this.process_assignment.put(process, segmentName);
|
||||
}
|
||||
|
||||
public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) {
|
||||
if (!oldSingleSegment.exists()) return;
|
||||
File newSegmentPath = new File(newSegmentsPath, newSegmentName);
|
||||
if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
|
||||
Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
|
||||
Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
|
||||
|
||||
String[] oldFiles = oldSingleSegment.list();
|
||||
for (String oldFile: oldFiles) {
|
||||
if (oldFile.startsWith("text.")) {
|
||||
new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String[] segmentNames() {
|
||||
return this.segments.keySet().toArray(new String[this.segments.size()]);
|
||||
}
|
||||
|
||||
public boolean segmentExist(final String segmentName) {
|
||||
return segments.containsKey(segmentName);
|
||||
}
|
||||
|
||||
public Segment segment(final Process process) {
|
||||
return segment(this.process_assignment.get(process));
|
||||
}
|
||||
|
||||
public Segment segment(final String segmentName) {
|
||||
Segment segment = segments.get(segmentName);
|
||||
if (segment == null) {
|
||||
// generate the segment
|
||||
try {
|
||||
segment = new Segment(
|
||||
this.log,
|
||||
new File(this.segmentsPath, segmentName),
|
||||
this.entityCacheMaxSize,
|
||||
this.maxFileSize,
|
||||
this.useTailCache,
|
||||
this.exceed134217727);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
return null;
|
||||
}
|
||||
this.segments.put(segmentName, segment);
|
||||
}
|
||||
return segment;
|
||||
}
|
||||
|
||||
public int URLCount() {
|
||||
int c = 0;
|
||||
for (Segment s: this.segments.values()) c += s.urlMetadata().size();
|
||||
return c;
|
||||
}
|
||||
|
||||
public int RWICount() {
|
||||
int c = 0;
|
||||
for (Segment s: this.segments.values()) c += s.termIndex().sizesMax();
|
||||
return c;
|
||||
}
|
||||
|
||||
public int RWIBufferCount() {
|
||||
int c = 0;
|
||||
for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
|
||||
return c;
|
||||
}
|
||||
|
||||
public MetadataRepository urlMetadata(final Process process) {
|
||||
return segment(this.process_assignment.get(process)).urlMetadata();
|
||||
}
|
||||
|
||||
public IndexCell<WordReference> termIndex(final Process process) {
|
||||
return segment(this.process_assignment.get(process)).termIndex();
|
||||
}
|
||||
|
||||
public void clear(final Process process) {
|
||||
segment(this.process_assignment.get(process)).clear();
|
||||
}
|
||||
|
||||
public File getLocation(final Process process) {
|
||||
return segment(this.process_assignment.get(process)).getLocation();
|
||||
}
|
||||
|
||||
public void close(final Process process) {
|
||||
segment(this.process_assignment.get(process)).close();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (segments != null) for (Segment s: this.segments.values()) s.close();
|
||||
this.segments = null;
|
||||
}
|
||||
|
||||
public void finalize() {
|
||||
this.close();
|
||||
}
|
||||
|
||||
public URLMetadataRow storeDocument(
|
||||
final String segmentName,
|
||||
final yacyURL url,
|
||||
final yacyURL referrerURL,
|
||||
final Date docDate,
|
||||
final long sourcesize,
|
||||
final Document document,
|
||||
final Condenser condenser
|
||||
) throws IOException {
|
||||
return segment(segmentName).storeDocument(
|
||||
url,
|
||||
referrerURL,
|
||||
docDate,
|
||||
sourcesize,
|
||||
document,
|
||||
condenser
|
||||
);
|
||||
}
|
||||
|
||||
public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) throws IOException {
|
||||
return segment(segmentName).getReferenceCleaner(startHash);
|
||||
}
|
||||
|
||||
public Iterator<Segment> iterator() {
|
||||
return this.segments.values().iterator();
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in new issue