- modified, simplified old kelondroHTCache object; I believe it should be replaced by something completely new

- removed tree data type in kelondroHTCache
- added new class kelondroHeap; may be the core for a storage object that will once replace the many-files strategy of kelondroHTCache
- removed compatibility mode in indexRAMRI


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4747 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent d3715e02ae
commit 32b5b057b9

@ -206,25 +206,6 @@ proxyCacheSize__pro = 1024
# storage place for new releases
releases = DATA/RELEASE
# use the mostly direct mapping of URLs to Filenames
# makes it easy watching the content of the cache using file browsers
# problems arise when a file already exists where a new entry expects a directory
# or vice versa.
# when set to false, the file names are set to the hash of the URL and the
# directory is built from protocol, hostname and port, as with the old
# layout.
# the advantage of this scheme is that no directory/file collisions can
# occur.
# switching this flag will take effect after a restart of yacy.
# files that are present under the previously used layout will be renamed
# to the new location and thus be accessible immediately. so an accumulated
# cache is still usable after the switch.
# possible values are {tree, hash}
proxyCacheLayout = hash
# the migration flag shows, if the different layout shall be migrated from one to another
proxyCacheMigration = true
# the following mime-types are the whitelist for indexing
#
# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser

@ -42,19 +42,12 @@ import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBufferedRA;
import de.anomic.kelondro.kelondroByteOrder;
import de.anomic.kelondro.kelondroBytesLongMap;
import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFixedWidthArray;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroRow.EntryIndex;
import de.anomic.server.serverMemory;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class indexContainerHeap {
@ -154,7 +147,7 @@ public final class indexContainerHeap {
synchronized (index) {
is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024));
// dont test available() here because this does not work for files > 2GB
// don't test available() here because this does not work for files > 2GB
loop: while (true) {
// remember seek position
seek0 = seek;
@ -476,70 +469,4 @@ public final class indexContainerHeap {
cache.put(wordHash, container);
}
/**
* this is a compatibility method for a old heap dump format. don't use it if not necessary
* @param indexArrayFile
* @throws IOException
*/
public void restoreArray(File indexArrayFile) throws IOException {
// is only here to read old array data structures
if (!(indexArrayFile.exists())) return;
this.readOnlyMode = false;
kelondroFixedWidthArray dumpArray;
kelondroBufferedRA readBuffer = null;
kelondroRow bufferStructureBasis = new kelondroRow(
"byte[] wordhash-" + yacySeedDB.commonHashLength + ", " +
"Cardinal occ-4 {b256}, " +
"Cardinal time-8 {b256}, " +
"byte[] urlprops-" + payloadrow.objectsize,
kelondroBase64Order.enhancedCoder, 0);
dumpArray = new kelondroFixedWidthArray(indexArrayFile, bufferStructureBasis, 0);
log.logInfo("started restore of ram cache '" + indexArrayFile.getName() + "', " + dumpArray.size() + " word/URL relations");
long startTime = System.currentTimeMillis();
long messageTime = System.currentTimeMillis() + 5000;
long urlCount = 0, urlsPerSecond = 0;
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, indexContainer>(new kelondroByteOrder.StringOrder(payloadrow.getOrdering())));
try {
Iterator<EntryIndex> i = dumpArray.contentRows(-1);
String wordHash;
//long creationTime;
indexRWIRowEntry wordEntry;
kelondroRow.EntryIndex row;
while (i.hasNext()) {
// get out one entry
row = i.next();
if ((row == null) || (row.empty(0)) || (row.empty(3))) continue;
wordHash = row.getColString(0, "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new indexRWIRowEntry(row.getColBytes(3));
// store to cache
indexContainer container = cache.get(wordHash);
if (container == null) container = new indexContainer(wordHash, payloadrow, 1);
container.put(wordEntry);
cache.put(wordHash, container);
urlCount++;
// protect against memory shortage
//while (serverMemory.free() < 1000000) {flushFromMem(); java.lang.System.gc();}
// write a log
if (System.currentTimeMillis() > messageTime) {
serverMemory.gc(1000, "indexRAMRI, for better statistic-2"); // for better statistic - thq
urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpArray.size() - urlCount) / urlsPerSecond) + " seconds remaining, free mem = " + (serverMemory.free() / 1024 / 1024) + "MB");
messageTime = System.currentTimeMillis() + 5000;
}
}
if (readBuffer != null) readBuffer.close();
dumpArray.close();
dumpArray = null;
log.logInfo("finished restore: " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
} catch (kelondroException e) {
// restore failed
log.logSevere("failed restore of indexCache array dump: " + e.getMessage(), e);
} finally {
if (dumpArray != null) try {dumpArray.close();}catch(Exception e){}
}
}
}

@ -61,24 +61,11 @@ public final class indexRAMRI implements indexRI, indexRIReader {
this.cacheReferenceCountLimit = wCacheReferenceCountLimitInit;
this.cacheReferenceAgeLimit = wCacheReferenceAgeLimitInit;
this.log = log;
File indexArrayFile = new File(databaseRoot, oldArrayName);
this.indexHeapFile = new File(databaseRoot, newHeapName);
this.heap = new indexContainerHeap(payloadrow, log);
// read in dump of last session
if (indexArrayFile.exists()) {
try {
heap.restoreArray(indexArrayFile);
for (indexContainer ic : (Iterable<indexContainer>) heap.wordContainers(null, false)) {
this.hashDate.setScore(ic.getWordHash(), intTime(ic.lastWrote()));
this.hashScore.setScore(ic.getWordHash(), ic.size());
}
} catch (IOException e){
log.logSevere("unable to restore cache dump: " + e.getMessage(), e);
}
indexArrayFile.delete();
if (indexArrayFile.exists()) log.logSevere("cannot delete old array file: " + indexArrayFile.toString() + "; please delete manually");
} else if (indexHeapFile.exists()) {
if (indexHeapFile.exists()) {
try {
heap.initWriteMode(indexHeapFile);
for (indexContainer ic : (Iterable<indexContainer>) heap.wordContainers(null, false)) {

@ -0,0 +1,194 @@
// kelondroHeap.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.04.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
// $LastChangedRevision: 4558 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import de.anomic.kelondro.kelondroByteOrder;
import de.anomic.kelondro.kelondroBytesLongMap;
public final class kelondroHeap {
private kelondroBytesLongMap index;
private File heapFile;
private kelondroByteOrder ordering;
/**
* create a heap file: a arbitrary number of BLOBs, indexed by an access key
* The heap file will be opened at initialization time, indexed and closed again.
* Heap files are only opened when BLOBs are read from it or new one are appended
* @param heapFile
* @param keylength
* @param ordering
* @throws IOException
*/
public kelondroHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException {
this.index = null;
this.ordering = ordering;
this.heapFile = heapFile;
if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist");
if (heapFile.length() >= (long) Integer.MAX_VALUE) throw new IOException("file " + heapFile + " too large, index can only be crated for files less than 2GB");
this.index = new kelondroBytesLongMap(keylength, this.ordering, 0);
DataInputStream is = null;
String keystring;
byte[] key = new byte[keylength];
int reclen;
long seek = 0, seek0;
is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024));
// don't test available() here because this does not work for files > 2GB
loop: while (true) {
// remember seek position
seek0 = seek;
// read length of the following record without the length of the record size bytes
try {
reclen = is.readInt();
} catch (IOException e) {
break loop; // terminate loop
}
seek += 4L;
// read key
try {
is.readFully(key);
} catch (IOException e) {
break loop; // terminate loop
}
keystring = new String(key);
seek += (long) keystring.length();
// skip content
seek += (long) reclen;
while (reclen > 0) reclen -= is.skip(reclen);
// store access address to entry
try {
index.addl(key, seek0);
} catch (IOException e) {
e.printStackTrace();
break loop;
}
}
is.close();
}
/**
* the number of BLOBs in the heap
* @return the number of BLOBs in the heap
*/
public int size() {
return this.index.size();
}
/**
* test if a key is in the heap file
* @param key
* @return true if the key exists, false othervise
*/
public boolean has(String key) {
assert index != null;
assert index.row().primaryKeyLength == key.length();
// check if the index contains the key
try {
return index.getl(key.getBytes()) >= 0;
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
/**
* add a BLOB to the heap
* @param key
* @param blob
* @throws IOException
*/
public synchronized void add(String key, byte[] blob) throws IOException {
add(key, blob, 0, blob.length);
}
/**
* add a BLOB to the heap
* @param key
* @param blob
* @throws IOException
*/
public synchronized void add(String key, byte[] blob, int offset, int len) throws IOException {
assert index.row().primaryKeyLength == key.length();
if ((blob == null) || (blob.length == 0)) return;
DataOutputStream os = null;
try {
os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(heapFile)));
} catch (FileNotFoundException e) {
throw new IOException(e.getMessage());
}
int pos = os.size();
os.writeInt(len);
os.write(key.getBytes());
os.write(blob, offset, len);
os.close();
index.putl(key.getBytes(), pos);
}
/**
* read a blob from the heap
* @param key
* @return
* @throws IOException
*/
public byte[] get(String key) throws IOException {
assert index.row().primaryKeyLength == key.length();
// check if the index contains the key
long pos = index.getl(key.getBytes());
if (pos < 0) return null;
// access the file and read the container
RandomAccessFile raf = new RandomAccessFile(heapFile, "r");
int len = raf.readInt();
byte[] record = new byte[len];
raf.seek(pos + 4 + index.row().primaryKeyLength);
raf.readFully(record);
raf.close();
return record;
}
}

@ -61,14 +61,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.lang.StringBuffer;
import java.net.InetAddress;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -100,16 +98,14 @@ public final class plasmaHTCache {
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
static kelondroMapObjects responseHeaderDB = null;
private static final LinkedList<Entry> cacheStack = new LinkedList<Entry>();
private static final Map<String, File> cacheAge = Collections.synchronizedMap(new TreeMap<String, File>()); // a <date+hash, cache-path> - relation
private static final ConcurrentLinkedQueue<Entry> cacheStack = new ConcurrentLinkedQueue<Entry>();
private static final ConcurrentHashMap<String, File> cacheAge = new ConcurrentHashMap<String, File>(); // a <date+hash, cache-path> - relation
public static long curCacheSize = 0;
public static long maxCacheSize;
public static File cachePath;
public static final serverLog log = new serverLog("HTCACHE");
public static final HashSet<File> filesInUse = new HashSet<File>(); // can we delete this file
public static String cacheLayout;
public static boolean cacheMigration;
private static long lastcleanup = System.currentTimeMillis();
private static ResourceInfoFactory objFactory = new ResourceInfoFactory();
private static serverThread cacheScanThread;
@ -126,29 +122,6 @@ public final class plasmaHTCache {
public static final char DT_BINARY = 'b';
public static final char DT_UNKNOWN = 'u';
// appearance locations: (used for flags)
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // headline - top level
public static final int AP_H2 = 2; // headline, second level
public static final int AP_H3 = 3; // headline, 3rd level
public static final int AP_H4 = 4; // headline, 4th level
public static final int AP_H5 = 5; // headline, 5th level
public static final int AP_H6 = 6; // headline, 6th level
public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
public static final int AP_DOM = 8; // word inside an url: in Domain
public static final int AP_PATH = 9; // word inside an url: in path
public static final int AP_IMG = 10; // tag inside image references
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_ENV = 12; // word appears in environment (similar to anchor appearance)
public static final int AP_BOLD = 13; // may be interpreted as emphasized
public static final int AP_ITALICS = 14; // may be interpreted as emphasized
public static final int AP_WEAK = 15; // for Text that is small or bareley visible
public static final int AP_INVISIBLE = 16; // good for spam detection
public static final int AP_TAG = 17; // for tagged indexeing (i.e. using mp3 tags)
public static final int AP_AUTHOR = 18; // word appears in author name
public static final int AP_OPUS = 19; // word appears in name of opus, which may be an album name (in mp3 tags)
public static final int AP_TRACK = 20; // word appears in track name (i.e. in mp3 tags)
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
public static final int UA_TILDE = 1; // tilde appears in URL
@ -229,11 +202,9 @@ public final class plasmaHTCache {
return doctype;
}
public static void init(File htCachePath, long CacheSizeMax, String layout, boolean migration) {
public static void init(File htCachePath, long CacheSizeMax) {
cachePath = htCachePath;
cacheLayout = layout;
cacheMigration = migration;
maxCacheSize = CacheSizeMax;
@ -328,9 +299,7 @@ public final class plasmaHTCache {
}
public static int size() {
synchronized (cacheStack) {
return cacheStack.size();
}
return cacheStack.size();
}
public static int dbSize() {
@ -338,17 +307,11 @@ public final class plasmaHTCache {
}
public static void push(Entry entry) {
synchronized (cacheStack) {
cacheStack.add(entry);
}
cacheStack.add(entry);
}
public static Entry pop() {
synchronized (cacheStack) {
if (cacheStack.size() > 0)
return cacheStack.removeFirst();
return null;
}
return cacheStack.poll();
}
/**
@ -388,18 +351,15 @@ public final class plasmaHTCache {
return true;
}
private static long lastcleanup = System.currentTimeMillis();
public static void writeFileAnnouncement(File file) {
synchronized (cacheAge) {
if (file.exists()) {
curCacheSize += file.length();
if (System.currentTimeMillis() - lastcleanup > 300000) {
// call the cleanup job only every 5 minutes
cleanup();
lastcleanup = System.currentTimeMillis();
}
cacheAge.put(ageString(file.lastModified(), file), file);
if (file.exists()) {
curCacheSize += file.length();
if (System.currentTimeMillis() - lastcleanup > 300000) {
// call the cleanup job only every 5 minutes
cleanup();
lastcleanup = System.currentTimeMillis();
}
cacheAge.put(ageString(file.lastModified(), file), file);
}
}
@ -419,7 +379,7 @@ public final class plasmaHTCache {
}
private static boolean deleteFile(File obj) {
if (obj.exists() && !filesInUse.contains(obj)) {
if (obj.exists()) {
long size = obj.length();
if (obj.delete()) {
curCacheSize -= size;
@ -446,41 +406,38 @@ public final class plasmaHTCache {
private static void cleanupDoIt(long newCacheSize) {
File file;
synchronized (cacheAge) {
Iterator<Map.Entry<String, File>> iter = cacheAge.entrySet().iterator();
Map.Entry<String, File> entry;
while (iter.hasNext() && curCacheSize >= newCacheSize) {
if (Thread.currentThread().isInterrupted()) return;
entry = iter.next();
String key = entry.getKey();
file = entry.getValue();
long t = Long.parseLong(key.substring(0, 16), 16);
if (System.currentTimeMillis() - t < 300000) break; // files must have been at least 5 minutes in the cache before they are deleted
if (file != null) {
if (filesInUse.contains(file)) continue;
if (log.isFinest()) log.logFinest("Trying to delete [" + key + "] = old file: " + file.toString());
// This needs to be called *before* the file is deleted
String urlHash = getHash(file);
if (deleteFileandDirs(file, "OLD")) {
try {
// As the file is gone, the entry in responseHeader.db is not needed anymore
if (urlHash != null) {
if (log.isFinest()) log.logFinest("Trying to remove responseHeader for URLhash: " + urlHash);
responseHeaderDB.remove(urlHash);
} else {
yacyURL url = getURL(file);
if (url != null) {
if (log.isFinest()) log.logFinest("Trying to remove responseHeader for URL: " + url.toNormalform(false, true));
responseHeaderDB.remove(url.hash());
}
Iterator<Map.Entry<String, File>> iter = cacheAge.entrySet().iterator();
Map.Entry<String, File> entry;
while (iter.hasNext() && curCacheSize >= newCacheSize) {
if (Thread.currentThread().isInterrupted()) return;
entry = iter.next();
String key = entry.getKey();
file = entry.getValue();
long t = Long.parseLong(key.substring(0, 16), 16);
if (System.currentTimeMillis() - t < 300000) break; // files must have been at least 5 minutes in the cache before they are deleted
if (file != null) {
if (log.isFinest()) log.logFinest("Trying to delete [" + key + "] = old file: " + file.toString());
// This needs to be called *before* the file is deleted
String urlHash = getHash(file);
if (deleteFileandDirs(file, "OLD")) {
try {
// As the file is gone, the entry in responseHeader.db is not needed anymore
if (urlHash != null) {
if (log.isFinest()) log.logFinest("Trying to remove responseHeader for URLhash: " + urlHash);
responseHeaderDB.remove(urlHash);
} else {
yacyURL url = getURL(file);
if (url != null) {
if (log.isFinest()) log.logFinest("Trying to remove responseHeader for URL: " + url.toNormalform(false, true));
responseHeaderDB.remove(url.hash());
}
} catch (IOException e) {
log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e);
}
} catch (IOException e) {
log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e);
}
}
iter.remove();
}
iter.remove();
}
}
@ -664,29 +621,6 @@ public final class plasmaHTCache {
return plasmaParser.mediaExtContains(urlString);
}
/**
* This function moves an old cached object (if it exists) to the new position
*/
private static void moveCachedObject(File oldpath, File newpath) {
try {
if (oldpath.exists() && oldpath.isFile() && (!newpath.exists())) {
long d = oldpath.lastModified();
newpath.getParentFile().mkdirs();
if (oldpath.renameTo(newpath)) {
cacheAge.put(ageString(d, newpath), newpath);
File obj = oldpath.getParentFile();
while ((!(obj.equals(cachePath))) && (obj.isDirectory()) && (obj.list().length == 0)) {
if (obj.delete()) if (log.isFine()) log.logFine("DELETED EMPTY DIRECTORY : " + obj.toString());
obj = obj.getParentFile();
}
}
}
} catch (Exception e) {
if (log.isFine()) log.logFine("moveCachedObject('" + oldpath.toString() + "','" +
newpath.toString() + "')", e);
}
}
private static String replaceRegex(String input, String regex, String replacement) {
if (input == null) { return ""; }
if (input.length() > 0) {
@ -767,34 +701,9 @@ public final class plasmaHTCache {
fileName.append('!').append(port);
}
// generate cache path according to storage method
if (cacheLayout.equals("tree")) {
File FileTree = treeFile(fileName, "tree", path);
if (cacheMigration) {
moveCachedObject(hashFile(fileName, "hash", extention, url.hash()), FileTree);
moveCachedObject(hashFile(fileName, null, extention, url.hash()), FileTree); // temporary migration
moveCachedObject(treeFile(fileName, null, path), FileTree); // temporary migration
}
return FileTree;
}
if (cacheLayout.equals("hash")) {
File FileFlat = hashFile(fileName, "hash", extention, url.hash());
if (cacheMigration) {
moveCachedObject(treeFile(fileName, "tree", path), FileFlat);
moveCachedObject(treeFile(fileName, null, path), FileFlat); // temporary migration
moveCachedObject(hashFile(fileName, null, extention, url.hash()), FileFlat); // temporary migration
}
return FileFlat;
}
return null;
}
private static File treeFile(StringBuffer fileName, String prefix, String path) {
StringBuffer f = new StringBuffer(fileName.length() + 30);
f.append(fileName);
if (prefix != null) f.append('/').append(prefix);
f.append(path);
return new File(cachePath, f.toString());
// generate cache path
File FileFlat = hashFile(fileName, "hash", extention, url.hash());
return FileFlat;
}
private static File hashFile(StringBuffer fileName, String prefix, String extention, String urlhash) {
@ -807,7 +716,6 @@ public final class plasmaHTCache {
return new File(cachePath, f.toString());
}
/**
* This is a helper function that extracts the Hash from the filename
*/

@ -593,46 +593,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
* @see plasmaSwitchboard#PROXY_CACHE_LAYOUT_TREE
* @see plasmaSwitchboard#PROXY_CACHE_LAYOUT_HASH
*/
public static final String PROXY_CACHE_LAYOUT = "proxyCacheLayout";
/**
* <p><code>public static final String <strong>PROXY_CACHE_LAYOUT_TREE</strong> = "tree"</code></p>
* <p>Setting the file-/folder-structure for {@link #PROXY_CACHE_LAYOUT}. Websites are stored in a folder-layout
* according to the layout, the URL purported. The first folder is either <code>http</code> or <code>https</code>
* depending on the protocol used to fetch the website, descending follows the hostname and the sub-folders on the
* website up to the actual file itself.</p>
* <p>When using <code>tree</code>, be aware that
* the possibility of inconsistencies between folders and files with the same name may occur which prevent proper
* storage of the fetched site. Below is an example how files are stored:</p>
* <pre>
* /html/
* /html/www.example.com/
* /html/www.example.com/index/
* /html/www.example.com/index/en/
* /html/www.example.com/index/en/index.html</pre>
*/
public static final String PROXY_CACHE_LAYOUT_TREE = "tree";
/**
* <p><code>public static final String <strong>PROXY_CACHE_LAYOUT_HASH</strong> = "hash"</code></p>
* <p>Setting the file-/folder-structure for {@link #PROXY_CACHE_LAYOUT}. Websites are stored using the MD5-sum of
* their respective URLs. This method prevents collisions on some websites caused by using the {@link #PROXY_CACHE_LAYOUT_TREE}
* layout.</p>
* <p>Similarly to {@link #PROXY_CACHE_LAYOUT_TREE}, the top-folders name is given by the protocol used to fetch the site,
* followed by either <code>www</code> or &ndash; if the hostname does not start with "www" &ndash; <code>other</code>.
* Afterwards the next folder has the rest of the hostname as name, followed by a folder <code>hash</code> which contains
* a folder consisting of the first two letters of the hash. Another folder named after the 3rd and 4th letters of the
* hash follows which finally contains the file named after the full 18-characters long hash.
* Below is an example how files are stored:</p>
* <pre>
* /html/
* /html/www/
* /html/www/example.com/
* /html/www/example.com/hash/
* /html/www/example.com/hash/0d/
* /html/www/example.com/hash/0d/f8/
* /html/www/example.com/hash/0d/f8/0df83a8444f48317d8</pre>
*/
public static final String PROXY_CACHE_LAYOUT_HASH = "hash";
public static final String PROXY_CACHE_MIGRATION = "proxyCacheMigration";
//////////////////////////////////////////////////////////////////////////////////////////////
// Cluster settings
@ -1087,9 +1047,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
htCachePath = getConfigPath(HTCACHE_PATH, HTCACHE_PATH_DEFAULT);
this.log.logInfo("HTCACHE Path = " + htCachePath.getAbsolutePath());
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig(PROXY_CACHE_SIZE, "2")); // this is megabyte
String cacheLayout = getConfig(PROXY_CACHE_LAYOUT, PROXY_CACHE_LAYOUT_TREE);
boolean cacheMigration = getConfigBool(PROXY_CACHE_MIGRATION, true);
plasmaHTCache.init(htCachePath, maxCacheSize, cacheLayout, cacheMigration);
plasmaHTCache.init(htCachePath, maxCacheSize);
// create the release download directory
releasePath = getConfigPath(RELEASE_PATH, RELEASE_PATH_DEFAULT);
@ -1139,19 +1097,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
// create queue
this.sbQueue = new plasmaSwitchboardQueue(wordIndex, new File(this.plasmaPath, "switchboardQueue2.stack"), this.profilesActiveCrawls);
// going through the sbQueue Entries and registering all content files as in use
int count = 0;
plasmaSwitchboardQueue.QueueEntry queueEntry;
Iterator<plasmaSwitchboardQueue.QueueEntry> i1 = sbQueue.entryIterator(true);
while (i1.hasNext()) {
queueEntry = i1.next();
if ((queueEntry != null) && (queueEntry.url() != null) && (queueEntry.cacheFile().exists())) {
plasmaHTCache.filesInUse.add(queueEntry.cacheFile());
count++;
}
}
this.log.logConfig(count + " files in htcache reported to the cachemanager as in use.");
// define an extension-blacklist
log.logConfig("Parser: Initializing Extension Mappings for Media/Parser");
plasmaParser.initMediaExt(plasmaParser.extString2extList(getConfig(PARSER_MEDIA_EXT,"")));
@ -1696,12 +1641,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
/* =========================================================================
* INDEXING
* ========================================================================= */
if (doIndexing && isSupportedContent){
// registering the cachefile as in use
if (entry.cacheFile().exists()) {
plasmaHTCache.filesInUse.add(entry.cacheFile());
}
if (doIndexing && isSupportedContent) {
// enqueue for further crawling
enQueue(this.sbQueue.newEntry(
@ -2152,10 +2092,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
document = null;
}
if (document == null) {
if (!in.queueEntry.profile().storeHTCache()) {
plasmaHTCache.filesInUse.remove(in.queueEntry.cacheFile());
//plasmaHTCache.deleteURLfromCache(entry.url());
}
in.queueEntry.close();
return null;
}
@ -2235,10 +2171,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
condenser = null;
}
if (condenser == null) {
if (!in.queueEntry.profile().storeHTCache()) {
plasmaHTCache.filesInUse.remove(in.queueEntry.cacheFile());
//plasmaHTCache.deleteURLfromCache(entry.url());
}
in.queueEntry.close();
return null;
}
@ -2305,10 +2237,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
public void storeDocumentIndex(indexingQueueEntry in) {
in.queueEntry.updateStatus(plasmaSwitchboardQueue.QUEUE_STATE_INDEXSTORAGE);
storeDocumentIndex(in.queueEntry, in.document, in.condenser);
if (!in.queueEntry.profile().storeHTCache()) {
plasmaHTCache.filesInUse.remove(in.queueEntry.cacheFile());
//plasmaHTCache.deleteURLfromCache(entry.url());
}
in.queueEntry.updateStatus(plasmaSwitchboardQueue.QUEUE_STATE_FINISHED);
in.queueEntry.close();
}

Loading…
Cancel
Save