replaced the indexing RAM queue with a file-based stack queue

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@381 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent 4851b432e1
commit 858cd94299
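
What this commit does, in one sentence: the indexing queue, previously a java.util.LinkedList of plasmaHTCache.Entry objects held in RAM (plasmaSwitchboard.queueStack), becomes a plasmaSwitchboardQueue backed by a .stack file, so queued indexing work survives a restart. A minimal, purely illustrative sketch of that idea follows; the class and method names are hypothetical, and this is not the kelondroStack implementation YaCy actually uses:

import java.io.*;
import java.util.LinkedList;

class FileStackQueueSketch {
    private final File stackFile;

    FileStackQueueSketch(File stackFile) { this.stackFile = stackFile; }

    // push: append one record per line; the entry is on disk immediately
    synchronized void push(String record) throws IOException {
        FileWriter w = new FileWriter(stackFile, true);
        try { w.write(record + "\n"); } finally { w.close(); }
    }

    // pop: read all records, take the head, rewrite the remainder
    synchronized String pop() throws IOException {
        if (!stackFile.exists()) return null;
        LinkedList records = new LinkedList();
        BufferedReader r = new BufferedReader(new FileReader(stackFile));
        try {
            String line;
            while ((line = r.readLine()) != null) records.add(line);
        } finally { r.close(); }
        if (records.isEmpty()) return null;
        String head = (String) records.removeFirst();
        FileWriter w = new FileWriter(stackFile, false);
        try {
            for (int i = 0; i < records.size(); i++) w.write(records.get(i) + "\n");
        } finally { w.close(); }
        return head;
    }
}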

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.384
releaseVersion=0.385
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -46,11 +46,13 @@
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardQueue;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -83,26 +85,27 @@ public class IndexCreateIndexingQueue_p {
boolean dark;
int i;
if (switchboard.queueStack.size() == 0) {
if (switchboard.sbQueue.size() == 0) {
prop.put("indexing-queue", 0); //is empty
} else {
prop.put("indexing-queue", 1);
prop.put("indexing-queue_num", switchboard.queueStack.size());//num entries in queue
prop.put("indexing-queue_num", switchboard.sbQueue.size());//num entries in queue
dark = true;
plasmaHTCache.Entry pcentry;
for (i = 0; i < switchboard.queueStack.size(); i++) {
pcentry = (plasmaHTCache.Entry) switchboard.queueStack.get(i);
plasmaSwitchboardQueue.Entry pcentry;
for (i = 0; i < switchboard.sbQueue.size(); i++) try {
pcentry = (plasmaSwitchboardQueue.Entry) switchboard.sbQueue.get(i);
if (pcentry != null) {
initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
prop.put("indexing-queue_list_"+i+"_dark", ((dark) ? 1 : 0));
prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth);
prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified));
prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size())));
prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) );
prop.put("indexing-queue_list_"+i+"_url", pcentry.nomalizedURLString);
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth());
prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "null" : daydate(pcentry.responseHeader().lastModified()));
prop.put("indexing-queue_list_"+i+"_href", pcentry.forkFactor());
prop.put("indexing-queue_list_"+i+"_anchor", pcentry.anchorName());
prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString());
dark = !dark;
}
} catch (IOException e) {
}
prop.put("indexing-queue_list", i);
}
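
Note on the rewritten loop above: with the file-backed queue, every sbQueue.get(i) can now fail with an IOException, hence the new per-iteration try/catch. Reduced to a sketch (render() stands in for the prop.put calls):

for (i = 0; i < switchboard.sbQueue.size(); i++) try {
    pcentry = (plasmaSwitchboardQueue.Entry) switchboard.sbQueue.get(i);
    if (pcentry != null) render(pcentry);
} catch (IOException e) {
    // one unreadable record must not abort rendering of the whole page
}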

@ -181,7 +181,7 @@ public class IndexCreate_p {
prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? 1 : 0);
prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? 1 : 0);
int queueStackSize = switchboard.queueStack.size();
int queueStackSize = switchboard.sbQueue.size();
int loaderThreadsSize = switchboard.cacheLoader.size();
int crawlerListSize = switchboard.urlPool.noticeURL.stackSize();
int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize;

@ -51,9 +51,9 @@ import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Vector;
import java.util.Iterator;
import de.anomic.http.httpdProxyHandler;
import de.anomic.plasma.plasmaSwitchboard;

@ -44,21 +44,19 @@
package de.anomic.data;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Hashtable;
import java.io.InputStreamReader;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;
import de.anomic.server.logging.serverLog;
import de.anomic.data.listManager;
/**
* Wordlist based translator

File diff suppressed because one or more lines are too long

@ -40,8 +40,8 @@
package de.anomic.htmlFilter;
import java.util.TreeSet;
import java.util.Properties;
import java.util.TreeSet;
public abstract class htmlFilterAbstractTransformer implements htmlFilterTransformer {

@ -40,14 +40,14 @@
package de.anomic.htmlFilter;
import java.net.URL;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.Collator;
import java.util.HashMap;
import java.util.TreeSet;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Locale;
import java.text.Collator;
import java.util.TreeSet;
import de.anomic.server.serverByteBuffer;

@ -43,11 +43,11 @@ package de.anomic.htmlFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.TreeSet;
import java.text.Collator;
import java.util.Locale;
import java.util.Properties;
import java.util.TreeSet;
import java.util.Vector;
import java.util.Locale;
import java.text.Collator;
import de.anomic.server.serverByteBuffer;

@ -42,16 +42,16 @@
package de.anomic.http;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.File;
import java.util.Hashtable;
import de.anomic.server.serverFileUtils;

@ -76,8 +76,6 @@ import java.util.zip.GZIPInputStream;
import javax.net.ssl.SSLSocketFactory;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
@ -85,6 +83,7 @@ import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverCore.Session;
import org.apache.commons.pool.impl.GenericObjectPool;
public final class httpc {

@ -57,8 +57,6 @@ import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.util.Properties;
import de.anomic.server.serverSwitch;
public interface httpdHandler {
void doGet(Properties conProp, httpHeader header, OutputStream response) throws IOException;

@ -72,7 +72,6 @@ import java.net.ConnectException;
import java.net.MalformedURLException;
import java.net.NoRouteToHostException;
import java.net.Socket;
import java.net.SocketException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;
@ -637,18 +636,20 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if (sizeBeforeDelete == -1) {
// totally fresh file
cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(cacheEntry, cacheArray);
cacheEntry.cacheArray = cacheArray;
cacheManager.push(cacheEntry);
conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS");
} else if (sizeBeforeDelete == cacheArray.length) {
// before we came here we deleted a cache entry
cacheArray = null;
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(cacheEntry); // unnecessary update
cacheManager.push(cacheEntry); // unnecessary update
conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REF_FAIL_HIT");
} else {
// before we came here we deleted a cache entry
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(cacheEntry, cacheArray); // necessary update, write response header to cache
cacheEntry.cacheArray = cacheArray;
cacheManager.push(cacheEntry); // necessary update, write response header to cache
conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS");
}
} else {
@ -661,15 +662,15 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if (sizeBeforeDelete == -1) {
// totally fresh file
cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(cacheEntry);
cacheManager.push(cacheEntry);
} else if (sizeBeforeDelete == cacheFile.length()) {
// before we came here we deleted a cache entry
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(cacheEntry); // unnecessary update
cacheManager.push(cacheEntry); // unnecessary update
} else {
// before we came here we deleted a cache entry
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(cacheEntry); // necessary update, write response header to cache
cacheManager.push(cacheEntry); // necessary update, write response header to cache
}
// beware! all these writings will not fill the cacheEntry.cacheArray
// that means they are not available for the indexer (unless they were scraped before)
@ -682,11 +683,11 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if (sizeBeforeDelete == -1) {
// no old file and no load. just data passing
cacheEntry.status = plasmaHTCache.CACHE_PASSING;
cacheManager.stackProcess(cacheEntry);
cacheManager.push(cacheEntry);
} else {
// before we came here we deleted a cache entry
cacheEntry.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(cacheEntry);
cacheManager.push(cacheEntry);
}
}
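
The hunks above all make the same move: instead of handing the body to stackProcess(entry, cacheArray), the proxy now attaches the body to the entry and push()es it, deferring header storage, file writing and indexing to the switchboard's htEntryStoreJob thread (see the plasmaSwitchboard hunks below). The new hand-over, reduced to its two lines:

cacheEntry.cacheArray = cacheArray; // the payload travels with the entry
cacheManager.push(cacheEntry);      // drained later by htEntryStoreJob()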

@ -43,8 +43,8 @@ package de.anomic.kelondro;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Map;
import java.util.TreeMap;
public class kelondroMScoreCluster {

@ -45,8 +45,6 @@ package de.anomic.kelondro;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Map;
public class kelondroMap {

@ -41,8 +41,8 @@
package de.anomic.kelondro;
import java.util.Iterator;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Set;
public class kelondroMergeIterator implements Iterator {

@ -56,7 +56,6 @@ import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
public class bzipParser extends AbstractParser implements Parser {
/**

@ -63,8 +63,8 @@ import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverCodings;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;

@ -45,14 +45,12 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Hashtable;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.server.serverSemaphore;
import de.anomic.server.logging.serverLog;
import org.apache.commons.pool.impl.GenericObjectPool;
public final class plasmaCrawlLoader extends Thread {
private final plasmaHTCache cacheManager;

@ -47,7 +47,6 @@ import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;

@ -49,8 +49,8 @@ import java.util.Iterator;
import java.util.Map;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMap;
import de.anomic.server.serverCodings;
public class plasmaCrawlProfile {

@ -53,7 +53,6 @@ import java.util.logging.Logger;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdProxyHandler;
import de.anomic.server.logging.serverLog;
import de.anomic.server.logging.serverMiniLogFormatter;
@ -331,13 +330,7 @@ public final class plasmaCrawlWorker extends Thread {
}
// enQueue new entry with response header
if (profile != null) {
if ((initiator == null) || (initiator.length() == 0)) {
// enqueued for proxy writings
cacheManager.stackProcess(htCache);
} else {
// direct processing for crawling
cacheManager.process(htCache);
}
cacheManager.push(htCache);
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection

@ -52,13 +52,12 @@ package de.anomic.plasma;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.Map;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -76,9 +75,8 @@ public final class plasmaHTCache {
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
private static final long idleDelay = 2000; // 2 seconds no hits until we think that we idle
private static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
private final plasmaSwitchboard switchboard;
private kelondroMap responseHeaderDB = null;
private final LinkedList cacheStack;
private final TreeMap cacheAge; // a <date+hash, cache-path> - relation
@ -96,20 +94,21 @@ public final class plasmaHTCache {
public static final int CACHE_STALE_RELOAD_BAD = 5; // this updates only the responseHeader, not the content
public static final int CACHE_PASSING = 6; // does not touch cache, just passing
public plasmaHTCache(plasmaSwitchboard switchboard, int bufferkb) {
this.switchboard = switchboard;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) {
//this.switchboard = switchboard;
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
this.maxCacheSize = maxCacheSize;
// set cache path
cachePath = new File(switchboard.getRootPath(),switchboard.getConfig("proxyCache","HTCACHE"));
if (!(cachePath.exists())) {
if (!(htCachePath.exists())) {
// make the cache path
cachePath.mkdir();
htCachePath.mkdir();
}
if (!(cachePath.isDirectory())) {
if (!(htCachePath.isDirectory())) {
// if the cache does not exists or is a file and not a directory, panic
System.out.println("the cache path " + cachePath.toString() + " is not a directory or does not exists and cannot be created");
System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
System.exit(0);
}
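
The constructor change above decouples plasmaHTCache from plasmaSwitchboard: instead of reading its own configuration from the switchboard, it now receives the cache path and size limit as parameters. The switchboard computes them, as shown in the plasmaSwitchboard hunk further down:

File htCachePath = new File(getRootPath(), getConfig("proxyCache", "HTCACHE"));
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // megabytes to bytes
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP);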
@ -134,13 +133,87 @@ public final class plasmaHTCache {
// init cache age and size management
cacheAge = new TreeMap();
currCacheSize = 0;
maxCacheSize = 1024 * 1024 * Long.parseLong(switchboard.getConfig("proxyCacheSize", "2")); // this is megabyte
this.maxCacheSize = maxCacheSize;
// start the cache startup thread
// this will collect information about the current cache size and elements
serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000);
}
public int size() {
return cacheStack.size();
}
public void push(Entry entry) {
cacheStack.add(entry);
}
public Entry pop() {
return (Entry) cacheStack.removeFirst();
}
public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException {
responseHeaderDB.set(urlHash, responseHeader);
}
public boolean deleteFile(URL url) {
File file = getCachePath(url);
if (file.exists()) {
currCacheSize -= file.length();
return file.delete();
} else {
return false;
}
}
public boolean writeFile(URL url, byte[] array) {
if (array == null) return false;
try {
File file = getCachePath(url);
if (file.exists()) {
currCacheSize -= file.length();
file.delete();
}
file.getParentFile().mkdirs();
serverFileUtils.write(array, file);
currCacheSize += file.length();
cacheAge.put(ageString(file.lastModified(), file), file);
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
// by the shallStoreCache() property. However, sometimes the error still occurs
// In this case do nothing.
log.logError("File storage failed (not a directory): " + e.getMessage());
return false;
} catch (IOException e) {
log.logError("File storage failed (IO error): " + e.getMessage());
return false;
}
cleanup();
return true;
}
private void cleanup() {
// clean up cache to have enough space for next entries
File f;
while (currCacheSize > maxCacheSize) {
f = (File) cacheAge.remove(cacheAge.firstKey());
if (f.exists()) {
currCacheSize -= f.length();
if (f.delete()) {
log.logInfo("DELETED OLD CACHE : " + f.toString());
f = f.getParentFile();
if ((f.exists()) && (f.isDirectory())) {
// check size of directory
if (f.list().length == 0) {
// the directory has no files in it; delete it also
if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
}
}
}
}
}
}
public void close() throws IOException {
responseHeaderDB.close();
}
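
plasmaHTCache is thus reduced to a passive store with a small API: push/pop for the RAM-side hand-over stack, storeHeader for the response-header DB, and writeFile/deleteFile with LRU cleanup for the on-disk cache. A hedged sketch of the resulting producer/consumer pairing (the drain loop here is illustrative; the real consumer is plasmaSwitchboard.htEntryStoreJob):

// producer side, e.g. the proxy handler:
cacheManager.push(entry);

// consumer side, one switchboard job cycle:
while (cacheManager.size() > 0) try {
    plasmaHTCache.Entry e = cacheManager.pop();
    cacheManager.storeHeader(e.nomalizedURLHash, e.responseHeader);
    if (e.cacheArray != null) cacheManager.writeFile(e.url, e.cacheArray);
} catch (IOException ex) {
    // log and continue with the next entry
}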
@ -172,8 +245,13 @@ public final class plasmaHTCache {
cacheAge.put(ageString(d, f), f);
}
//System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey()));
long ageHours = (System.currentTimeMillis() -
Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000;
long ageHours = 0;
try {
ageHours = (System.currentTimeMillis() -
Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000;
} catch (NumberFormatException e) {
e.printStackTrace();
}
log.logSystem("CACHE SCANNED, CONTAINS " + c +
" FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) +
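
The substring(0, 16) parse above implies that cacheAge keys begin with the file's modification time as a 16-digit zero-padded hex number, so plain string ordering in the TreeMap equals chronological ordering and firstKey() is always the oldest entry. A hedged sketch of such a key builder (the suffix is an illustrative disambiguator, not the actual ageString implementation):

private static String ageString(long date, File f) {
    String hex = Long.toHexString(date);
    StringBuffer key = new StringBuffer(32);
    for (int i = hex.length(); i < 16; i++) key.append('0'); // zero-pad to 16 digits
    key.append(hex);
    key.append(Integer.toHexString(f.hashCode())); // keep keys unique per file
    return key.toString();
}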
@ -224,145 +302,7 @@ public final class plasmaHTCache {
public boolean empty() {
return (cacheStack.size() == 0);
}
synchronized public void stackProcess(Entry entry) throws IOException {
lastAcc = System.currentTimeMillis();
if (full())
process(entry);
else
cacheStack.add(entry);
}
synchronized public void stackProcess(Entry entry, byte[] cacheArray) throws IOException {
lastAcc = System.currentTimeMillis();
entry.cacheArray = cacheArray;
if (full())
process(entry);
else
cacheStack.add(entry);
}
public int size() {
return cacheStack.size();
}
synchronized public void process(Entry entry) throws IOException {
if (entry == null) return;
// store response header
if ((entry.status == CACHE_FILL) ||
(entry.status == CACHE_STALE_RELOAD_GOOD) ||
(entry.status == CACHE_STALE_RELOAD_BAD)) {
responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader);
}
// work off unwritten files and undone parsing
String storeError = null;
if (((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD)) &&
((storeError = entry.shallStoreCache()) == null)) {
// write file if not written yet
if (entry.cacheArray != null) try {
if (entry.cacheFile.exists()) {
currCacheSize -= entry.cacheFile.length();
entry.cacheFile.delete();
}
entry.cacheFile.getParentFile().mkdirs();
log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile);
serverFileUtils.write(entry.cacheArray, entry.cacheFile);
log.logDebug("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full"));
//entry.cacheArray = null;
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
// by the shallStoreCache() property. However, sometimes the error still occurs
// In this case do nothing.
log.logError("File storage failed: " + e.getMessage());
}
// update statistics
currCacheSize += entry.cacheFile.length();
cacheAge.put(ageString(entry.cacheFile.lastModified(), entry.cacheFile), entry.cacheFile);
// enqueue in switchboard
switchboard.enQueue(entry);
} else if (entry.status == CACHE_PASSING) {
// even if the file should not be stored in the cache, it can be used to be indexed
if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError);
// enqueue in switchboard
switchboard.enQueue(entry);
}
// write log
switch (entry.status) {
case CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
break;
case CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
case CACHE_STALE_NO_RELOAD:
log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break;
case CACHE_STALE_RELOAD_GOOD:
log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break;
case CACHE_STALE_RELOAD_BAD:
log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break;
case CACHE_PASSING:
log.logInfo("PASSING: " + entry.cacheFile); break;
default:
log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break;
}
}
public boolean job() {
if (empty()) return false;
try {
File f;
int workoff;
workoff = 1 + cacheStack.size() / 10;
// we always want to work off 10 % to prevent a collapse
while ((workoff-- > 0) && (!(empty()))) {
process((Entry) cacheStack.removeFirst());
}
// loop until we are not idle or nothing more to do
while ((!empty()) && (idle())) {
// work off stack and store entries to file system
process((Entry) cacheStack.removeFirst());
// clean up cache to have enough space for next entries
while (currCacheSize > maxCacheSize) {
f = (File) cacheAge.remove(cacheAge.firstKey());
if (f.exists()) {
currCacheSize -= f.length();
if (f.delete()) {
log.logInfo("DELETED OLD CACHE : " + f.toString());
f = f.getParentFile();
if ((f.exists()) && (f.isDirectory())) {
// check size of directory
if (f.list().length == 0) {
// the directory has no files in it; delete it also
if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
}
}
}
}
}
}
} catch (IOException e) {
System.out.println("The proxy cache manager has died because of an IO-problem: " + e.getMessage());
e.printStackTrace(System.out);
System.exit(-1);
}
return true;
}
public static boolean isPicture(httpHeader response) {
Object ct = response.get(httpHeader.CONTENT_TYPE);
if (ct == null) return false;
@ -803,184 +743,6 @@ public final class plasmaHTCache {
return true;
}
public String shallIndexCacheForProxy() {
// decide upon header information if a specific file should be indexed
// this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason
// to reject the crawling demand in clear text
// check profile
if (!(profile.localIndexing())) return "Indexing_Not_Allowed";
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
// -authorization cases in request
// we checked that in shallStoreCache
// -ranges in request
// we checked that in shallStoreCache
// a picture cannot be indexed
if (isPicture(responseHeader)) return "Media_Content_(Picture)";
if (!(isText(responseHeader))) return "Media_Content_(not_text)";
if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)";
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
if ((requestHeader != null) &&
(requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) &&
(responseHeader.containsKey(httpHeader.LAST_MODIFIED))) {
// parse date
Date d1, d2;
d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date();
d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date();
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) {
//System.out.println("***not indexed because if-modified-since");
return "Stale_(Last-Modified>Modified-Since)";
}
}
// -cookies in request
// unfortunately, we cannot index pages which have been requested with a cookie
// because the returned content may be special for the client
if ((requestHeader != null) && (requestHeader.containsKey(httpHeader.COOKIE))) {
//System.out.println("***not indexed because cookie");
return "Dynamic_(Requested_With_Cookie)";
}
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
// -pragma in cached response
if ((responseHeader.containsKey(httpHeader.PRAGMA)) &&
(((String) responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return "Denied_(pragma_no_cache)";
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// calculate often needed values for freshness attributes
Date date = responseHeader.date();
Date expires = responseHeader.expires();
Date lastModified = responseHeader.lastModified();
String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL);
// look for freshness information
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent a page from being cached
// we use that information to see if we should index it
if (expires != null) {
Date yesterday = new Date((new Date()).getTime() - oneday);
if (expires.before(yesterday)) return "Stale_(Expired)";
}
// -lastModified in cached response
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
/* we have the following cases for cache-control:
"public" -- can be indexed
"private", "no-cache", "no-store" -- cannot be indexed
"max-age=<delta-seconds>" -- stale/fresh dependent on date
*/
if (cacheControl.startsWith("PUBLIC")) {
// ok, do nothing
} else if ((cacheControl.startsWith("PRIVATE")) ||
(cacheControl.startsWith("NO-CACHE")) ||
(cacheControl.startsWith("NO-STORE"))) {
// easy case
return "Stale_(denied_by_cache-control=" + cacheControl+ ")";
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) return "Stale_(no_date_given_in_response)";
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if ((new Date()).getTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "Stale_(expired_by_cache-control)";
}
} catch (Exception e) {
return "Error_(" + e.getMessage() + ")";
}
}
}
return null;
}
public String shallIndexCacheForCrawler() {
// decide upon header information if a specific file should be indexed
// this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason
// to reject the crawling demand in clear text
// check profile
if (!(profile.localIndexing())) return "Indexing_Not_Allowed";
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
// -authorization cases in request
// we checked that in shallStoreCache
// -ranges in request
// we checked that in shallStoreCache
// a picture cannot be indexed
if (isPicture(responseHeader)) return "Media_Content_(Picture)";
if (!(isText(responseHeader))) return "Media_Content_(not_text)";
if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)";
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
// -> this does not apply for the crawler
// -cookies in request
// unfortunately, we cannot index pages which have been requested with a cookie
// because the returned content may be special for the client
// -> this does not apply for a crawler
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
// -> this does not apply for a crawler
// -pragma in cached response
// -> in the crawler we ignore this
// look for freshness information
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent a page from being cached
// we use that information to see if we should index it
// -> this does not apply for a crawler
// -lastModified in cached response
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
// -> in the crawler we ignore this
return null;
}
}
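
The shallIndexCacheForProxy() logic removed above encodes the HTTP freshness rules for indexing; the least obvious branch is Cache-Control: max-age. Isolated as a self-contained sketch (method name hypothetical; the original additionally catches number-format errors):

// stale once (now - Date header) exceeds max-age seconds
static boolean staleByMaxAge(java.util.Date date, String cacheControl) {
    String cc = cacheControl.trim().toUpperCase();
    if (!cc.startsWith("MAX-AGE=")) return false;
    if (date == null) return true; // without a load date, freshness cannot be proven
    long ttl = 1000 * Long.parseLong(cc.substring(8)); // seconds to live, in milliseconds
    return System.currentTimeMillis() - date.getTime() > ttl;
}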

@ -68,17 +68,16 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.pool.KeyedPoolableObjectFactory;
import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedUploader;
import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.KeyedPoolableObjectFactory;
import org.apache.commons.pool.impl.GenericObjectPool;
public final class plasmaParser {
@ -264,15 +263,22 @@ public final class plasmaParser {
}
}
public static boolean supportedFileExtContains(String mediaExt) {
public static boolean supportedFileExt(URL url) {
String name = url.getFile();
int p = name.lastIndexOf('.');
if (p < 0) return true; // seems strange, but this is a directory entry or default file (html)
return supportedFileExtContains(name.substring(p + 1));
}
public static boolean supportedFileExtContains(String fileExt) {
if (supportedFileExt == null) return false;
synchronized(supportedFileExt) {
if (supportedFileExt.contains(mediaExt)) return true;
if (supportedFileExt.contains(fileExt)) return true;
}
synchronized (supportedRealtimeFileExt) {
return supportedRealtimeFileExt.contains(mediaExt);
return supportedRealtimeFileExt.contains(fileExt);
}
}
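
Usage note for the new supportedFileExt(URL): a file part without a dot is accepted outright as a directory or default (HTML) request; everything else is decided by the extension sets. For example (results assume the default parser registration):

plasmaParser.supportedFileExt(new URL("http://host/dir/"));      // true: no extension
plasmaParser.supportedFileExt(new URL("http://host/page.html")); // true if "html" is registered
plasmaParser.supportedFileExt(new URL("http://host/image.gif")); // false unless an image parser is registered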

@ -42,15 +42,17 @@
package de.anomic.plasma;
import java.util.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
import de.anomic.http.httpHeader;
import de.anomic.yacy.yacySearch;
public class plasmaSnippetCache {

@ -114,7 +114,6 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
@ -130,7 +129,6 @@ import de.anomic.kelondro.kelondroTables;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSemaphore;
@ -149,6 +147,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load slots
public static int crawlSlots = 10;
public static int indexingSlots = 100;
// coloured list management
public static TreeSet blueList = null;
@ -164,7 +163,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public plasmaHTCache cacheManager;
public plasmaSnippetCache snippetCache;
public plasmaCrawlLoader cacheLoader;
public LinkedList queueStack = new LinkedList();
public plasmaSwitchboardQueue sbQueue;
public messageBoard messageDB;
public wikiBoard wikiDB;
public String remoteProxyHost;
@ -256,11 +255,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start a cache manager
log.logSystem("Starting HT Cache Manager");
this.cacheManager = new plasmaHTCache(this, ramHTTP);
File htCachePath = new File(getRootPath(), getConfig("proxyCache","HTCACHE"));
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP);
// make parser
log.logSystem("Starting Parser");
this.parser = new plasmaParser();
this.parser = new plasmaParser();
// initialize switchboard queue
sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue0.stack"), 10, profiles);
// define an extension-blacklist
log.logSystem("Parser: Initializing Extension Mappings for Media/Parser");
@ -347,7 +351,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new serverInstantThread(this, "deQueue", "queueSize"), 10000 + (i * 1000));
}
deployThread("70_cachemanager", "Proxy Cache Enqueue", "job takes new proxy files from RAM stack, stores them, and hands over to the Indexing Stack",
new serverInstantThread(cacheManager, "job", "size"), 10000);
new serverInstantThread(this, "htEntryStoreJob", "htEntrySize"), 10000);
deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer",
new serverInstantThread(this, "remoteTriggeredCrawlJob", "remoteTriggeredCrawlJobSize"), 30000);
deployThread("61_globalcrawltrigger", "Global Crawl Trigger", "thread that triggeres remote peers for crawling",
@ -423,7 +427,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (IOException e) {}
}
private void cleanProfiles() {
if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return;
if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return;
Iterator i = profiles.profiles(true);
plasmaCrawlProfile.entry entry;
try {
@ -440,6 +444,100 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return cacheManager;
}
synchronized public void htEntryStoreEnqueued(plasmaHTCache.Entry entry) throws IOException {
if (cacheManager.full())
htEntryStoreProcess(entry);
else
cacheManager.push(entry);
}
synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throws IOException {
if (entry == null) return false;
// store response header
if ((entry.status == plasmaHTCache.CACHE_FILL) ||
(entry.status == plasmaHTCache.CACHE_STALE_RELOAD_GOOD) ||
(entry.status == plasmaHTCache.CACHE_STALE_RELOAD_BAD)) {
cacheManager.storeHeader(entry.nomalizedURLHash, entry.responseHeader);
}
// work off unwritten files and undone parsing
String storeError = null;
if (((entry.status == plasmaHTCache.CACHE_FILL) || (entry.status == plasmaHTCache.CACHE_STALE_RELOAD_GOOD)) &&
((storeError = entry.shallStoreCache()) == null)) {
// write file if not written yet
if (entry.cacheArray != null) {
cacheManager.writeFile(entry.url, entry.cacheArray);
log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile);
}
// enqueue for further crawling
enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
entry.initiator(), entry.depth, entry.profile.handle(),
(entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(),
(entry.scraper == null) ? 0 : entry.scraper.getImages().size(),
(entry.scraper == null) ? "" : entry.scraper.getHeadline()
));
} else if (entry.status == plasmaHTCache.CACHE_PASSING) {
// even if the file should not be stored in the cache, it can be used to be indexed
if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError);
// enqueue for further crawling
enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
entry.initiator(), entry.depth, entry.profile.handle(),
(entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(),
(entry.scraper == null) ? 0 : entry.scraper.getImages().size(),
(entry.scraper == null) ? "" : entry.scraper.getHeadline()
));
}
// write log
switch (entry.status) {
case plasmaHTCache.CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case plasmaHTCache.CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
break;
case plasmaHTCache.CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
case plasmaHTCache.CACHE_STALE_NO_RELOAD:
log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break;
case plasmaHTCache.CACHE_STALE_RELOAD_GOOD:
log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break;
case plasmaHTCache.CACHE_STALE_RELOAD_BAD:
log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break;
case plasmaHTCache.CACHE_PASSING:
log.logInfo("PASSING: " + entry.cacheFile); break;
default:
log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break;
}
return true;
}
public boolean htEntryStoreJob() {
if (cacheManager.empty()) return false;
try {
return htEntryStoreProcess(cacheManager.pop());
} catch (IOException e) {
return false;
}
}
public int htEntrySize() {
return cacheManager.size();
}
private static TreeSet loadList(File file) {
TreeSet list = new TreeSet(kelondroMSetTools.fastStringComparator);
if (!(file.exists())) return list;
@ -487,7 +585,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
public int queueSize() {
return queueStack.size();
return sbQueue.size();
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}
@ -502,16 +600,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public void enQueue(Object job) {
plasmaHTCache.Entry entry = (plasmaHTCache.Entry) job;
queueStack.addLast(entry);
if (!(job instanceof plasmaSwitchboardQueue.Entry)) {
System.out.println("internal error at plasmaSwitchboard.enQueue: wrong job type");
System.exit(0);
}
try {
sbQueue.push((plasmaSwitchboardQueue.Entry) job);
} catch (IOException e) {
log.logError("IOError in plasmaSwitchboard.enQueue: " + e.getMessage());
e.printStackTrace();
}
}
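
// Editor's usage note (argument order taken from the sbQueue.newEntry calls in
// htEntryStoreProcess above; the variables here are placeholders): callers now
// build a queue entry first, since enQueue() rejects anything that is not a
// plasmaSwitchboardQueue.Entry.
enQueue(sbQueue.newEntry(url, referrerHash, ifModifiedSince, requestedWithCookie,
        initiatorHash, depth, profileHandle, anchors, images, anchorName));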
public boolean deQueue() {
// work off fresh entries from the proxy or from the crawler
plasmaHTCache.Entry nextentry;
synchronized (queueStack) {
if (queueStack.size() == 0) {
plasmaSwitchboardQueue.Entry nextentry;
synchronized (sbQueue) {
if (sbQueue.size() == 0) {
//log.logDebug("DEQUEUE: queue is empty");
return false; // nothing to do
}
@ -521,12 +627,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do one processing step
log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? "idle" : "busy") +
", queueStack=" + queueStack.size() +
", sbQueueSize=" + sbQueue.size() +
", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) +
", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
nextentry = (plasmaHTCache.Entry) queueStack.removeFirst();
try {
nextentry = sbQueue.pop();
} catch (IOException e) {
log.logError("IOError in plasmaSwitchboard.deQueue: " + e.getMessage());
e.printStackTrace();
return false;
}
}
processResourceStack(nextentry);
return true;
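
A detail worth keeping from deQueue() above: the pop happens inside synchronized (sbQueue), while processResourceStack(nextentry) runs after the block, so heavy parsing and indexing work never holds the queue lock. The pattern in miniature (hypothetical names):

plasmaSwitchboardQueue.Entry next;
synchronized (queue) {
    if (queue.size() == 0) return false; // nothing to do
    next = queue.pop();                  // keep the critical section short
}
process(next);                           // heavy work outside the lock
return true;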
@ -601,13 +713,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
if (queueStack.size() >= crawlSlots) {
log.logDebug("CoreCrawl: too many processes in queue, dismissed (" +
"queueStack=" + queueStack.size() + ")");
if (sbQueue.size() >= indexingSlots) {
log.logDebug("CoreCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sbQueue.size() + ")");
return false;
}
if (cacheLoader.size() >= crawlSlots) {
log.logDebug("CoreCrawl: too many loader in queue, dismissed (" +
log.logDebug("CoreCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + cacheLoader.size() + ")");
return false;
}
@ -688,7 +800,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
boolean tryRemote =
((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (queueStack.size() != 0)) /* should do ourself */ &&
((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) /* should do ourself */ &&
(profile.remoteIndexing()) /* granted */ &&
(urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ &&
((yacyCore.seedDB.mySeed.isSenior()) ||
@ -700,9 +812,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// alternatively do a local crawl
if (queueStack.size() >= crawlSlots) {
if (sbQueue.size() >= crawlSlots) {
log.logDebug("LimitCrawl: too many processes in queue, dismissed (" +
"queueStack=" + queueStack.size() + ")");
"sbQueueSize=" + sbQueue.size() + ")");
return false;
}
if (cacheLoader.size() >= crawlSlots) {
@ -776,8 +888,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
processLocalCrawling(urlEntry, profile, stats);
return true;
}
private void processResourceStack(plasmaHTCache.Entry entry) {
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page)
try {
// we must distinguish the following cases: resource-load was initiated by
@ -802,39 +914,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
processCase = 6;
}
log.logDebug("processResourceStack processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG
log.logDebug("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() +
", maxDepth=" + ((entry.profile() == null) ? "null" : "" + entry.profile().generalDepth()) +
", filter=" + ((entry.profile() == null) ? "null" : "" + entry.profile().generalFilter()) +
", initiatorHash=" + initiatorHash +
", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + entry.url()); // DEBUG
// parse content
plasmaParserDocument document = null;
if (plasmaParser.supportedMimeTypesContains(entry.responseHeader.mime())) {
if (entry.scraper != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
} else if (entry.cacheArray != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from cacheArray");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
if ((plasmaParser.supportedFileExt(entry.url())) ||
((entry.responseHeader() != null) &&
(plasmaParser.supportedMimeTypesContains(entry.responseHeader().mime())))) {
if (entry.cacheFile().exists()) {
log.logDebug("(Parser) '" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url(), (entry.responseHeader() == null) ? null : entry.responseHeader().mime(), entry.cacheFile());
} else {
if (entry.cacheFile.exists()) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheFile);
} else {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' cannot be parsed, no resource available");
return;
}
log.logDebug("(Parser) '" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
return;
}
if (document == null) {
log.logError("(Parser) '" + entry.nomalizedURLString + "' parse failure");
log.logError("(Parser) '" + entry.normalizedURLString() + "' parse failure");
return;
}
} else {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "'. Unsupported mimeType '" + entry.responseHeader.mime() + "'.");
log.logDebug("(Parser) '" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((entry.responseHeader() == null) ? null : entry.responseHeader().mime()) + "'.");
return;
}
Date loadDate = entry.responseHeader().lastModified();
if (loadDate == null) loadDate = entry.responseHeader().date();
if (loadDate == null) loadDate = new Date();
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&
(entry.depth < entry.profile.generalDepth())) {
((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))) {
Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nexturlstring;
@ -844,15 +960,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (i.hasNext()) {
e = (Map.Entry) i.next();
nexturlstring = (String) e.getKey();
rejectReason = stackCrawl(nexturlstring, entry.nomalizedURLString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile);
rejectReason = stackCrawl(nexturlstring, entry.normalizedURLString(), initiatorHash, (String) e.getValue(), loadDate, entry.depth() + 1, entry.profile());
if (rejectReason == null) {
c++;
} else {
urlPool.errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
urlPool.errorURL.newEntry(new URL(nexturlstring), entry.normalizedURLString(), entry.initiator(), yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url().toString() +
", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
}
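
Since the queue entry no longer carries a lastModified field, the link date passed to stackCrawl() above is derived from the response header with a three-step fallback, so every stacked link has a defined date:

Date loadDate = entry.responseHeader().lastModified();         // 1. Last-Modified
if (loadDate == null) loadDate = entry.responseHeader().date(); // 2. Date header
if (loadDate == null) loadDate = new Date();                    // 3. time of processing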
@ -870,51 +987,56 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
if (noIndexReason == null) {
// strip out words
log.logDebug("(Profile) Condensing for '" + entry.nomalizedURLString + "'");
log.logDebug("(Profile) Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
//log.logInfo("INDEXING HEADLINE:" + descr);
try {
log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'");
log.logDebug("(Profile) Create LURL-Entry for '" + entry.normalizedURLString() + "', " +
"responseHeader=" + entry.responseHeader().toString());
Date lastModified = entry.responseHeader().lastModified();
if (lastModified == null) lastModified = entry.responseHeader().date();
if (lastModified == null) lastModified = new Date();
plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry(
entry.url, descr, entry.lastModified, new Date(),
entry.url(), descr, lastModified, new Date(),
initiatorHash,
yacyCore.seedDB.mySeed.hash,
referrerHash,
0, true,
Integer.parseInt(condenser.getAnalysis().getProperty("INFORMATION_VALUE","0"), 16),
entry.language, entry.doctype,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(entry.responseHeader().mime()),
entry.size(),
(int) Long.parseLong(condenser.getAnalysis().getProperty("NUMB_WORDS","0"), 16),
processCase
);
String urlHash = newEntry.hash();
log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'");
log.logDebug("(Profile) Remove NURL for '" + entry.normalizedURLString() + "'");
urlPool.noticeURL.remove(urlHash); // worked-off
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) &&
(entry.profile.localIndexing())) {
(entry.profile().localIndexing())) {
// remove stopwords
log.logDebug("(Profile) Exclude Stopwords for '" + entry.nomalizedURLString + "'");
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url);
log.logDebug("(Profile) Exclude Stopwords for '" + entry.normalizedURLString() + "'");
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
//System.out.println("DEBUG: words left to be indexed: " + condenser.getWords());
// do indexing
log.logDebug("(Profile) Create Index for '" + entry.nomalizedURLString + "'");
int words = searchManager.addPageIndex(entry.url, urlHash, entry.lastModified, condenser, entry.language, entry.doctype);
log.logInfo("Indexed " + words + " words in URL " + entry.url + " (" + descr + ")");
log.logDebug("(Profile) Create Index for '" + entry.normalizedURLString() + "'");
int words = searchManager.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(entry.responseHeader().mime()));
log.logInfo("Indexed " + words + " words in URL " + entry.url() + " (" + descr + ")");
// if this was performed for a remote crawl request, notify requester
if ((processCase == 6) && (initiator != null)) {
log.logInfo("Sending crawl receipt for '" + entry.nomalizedURLString + "' to " + initiator.getName());
log.logInfo("Sending crawl receipt for '" + entry.normalizedURLString() + "' to " + initiator.getName());
yacyClient.crawlReceipt(initiator, "crawl", "fill", "indexed", newEntry, "");
}
} else {
log.logDebug("Resource '" + entry.nomalizedURLString + "' not indexed (indexing is off)");
log.logDebug("Resource '" + entry.normalizedURLString() + "' not indexed (indexing is off)");
}
} catch (Exception ee) {
log.logError("Could not index URL " + entry.url + ": " + ee.getMessage());
log.logError("Could not index URL " + entry.url() + ": " + ee.getMessage());
ee.printStackTrace();
if ((processCase == 6) && (initiator != null)) {
yacyClient.crawlReceipt(initiator, "crawl", "exception", ee.getMessage(), null, "");
@ -922,8 +1044,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
} else {
log.logInfo("Not indexed any word in URL " + entry.url + "; cause: " + noIndexReason);
urlPool.errorURL.newEntry(entry.url, referrerHash,
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
urlPool.errorURL.newEntry(entry.url(), referrerHash,
((entry.proxy()) ? plasmaURL.dummyHash : entry.initiator()),
yacyCore.seedDB.mySeed.hash,
descr, noIndexReason, new bitfield(plasmaURL.urlFlagLength), true);
@ -1464,7 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public String toString() {
// it is possible to use this method in the cgi pages.
// actually it is used there for testing purpose
return "PROPS: " + super.toString() + "; QUEUE: " + queueStack.toString();
return "PROPS: " + super.toString() + "; QUEUE: " + sbQueue.toString();
}
// method for index deletion
@ -1536,7 +1658,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
long starttime = System.currentTimeMillis();
try {
if (
(queueStack.size() == 0) &&
(sbQueue.size() == 0) &&
(cacheLoader.size() == 0) &&
(urlPool.noticeURL.stackSize() == 0) &&
(getConfig("allowDistributeIndex", "false").equals("true")) &&

@ -49,13 +49,12 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.yacy.yacySeedDB;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndex {

@ -52,10 +52,13 @@
package de.anomic.plasma;
import java.io.*;
import java.util.*;
import java.lang.RuntimeException;
import de.anomic.kelondro.*;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.logging.serverLog;
public final class plasmaWordIndexAssortment {

@ -47,8 +47,10 @@
package de.anomic.plasma;
import java.io.File;
import java.util.*;
import de.anomic.kelondro.*;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.server.logging.serverLog;
public final class plasmaWordIndexAssortmentCluster {

@ -42,10 +42,17 @@
package de.anomic.plasma;
import java.io.*;
import java.util.*;
import java.lang.RuntimeException;
import de.anomic.kelondro.*;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroStack;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;

@ -60,11 +60,14 @@
package de.anomic.plasma;
import java.io.*;
import java.util.*;
import de.anomic.server.*;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.logging.serverLog;
import de.anomic.kelondro.*;
public class plasmaWordIndexClassicCacheMigration {

@ -44,9 +44,12 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.kelondro.*;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;

@ -48,9 +48,9 @@ package de.anomic.plasma;
import java.net.URL;
import java.util.Properties;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
import de.anomic.htmlFilter.htmlFilterContentScraper;
public class plasmaWordIndexEntry {

@ -54,6 +54,7 @@ package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.server.serverCodings;
public class plasmaWordIndexEntryContainer implements Comparable {

@ -42,7 +42,7 @@
package de.anomic.plasma;
import java.util.*;
import java.util.Iterator;
public interface plasmaWordIndexInterface {

@ -44,8 +44,6 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;

@ -5,15 +5,9 @@ import java.io.StringWriter;
import java.text.FieldPosition;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.apache.commons.collections.map.CaseInsensitiveMap;
public class serverSimpleLogFormatter extends SimpleFormatter {

@ -48,11 +48,10 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Map;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;

@ -47,7 +47,6 @@
package de.anomic.server;
import java.net.InetAddress;
import java.util.Enumeration;
import java.util.Iterator;
import de.anomic.server.logging.serverLog;

@ -1,7 +1,6 @@
package de.anomic.yacy.seedUpload;
import java.io.File;
import java.net.URL;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverSwitch;

@ -52,11 +52,11 @@ import java.util.Vector;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearch;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.tools.crypt;

@ -62,7 +62,6 @@ import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.GregorianCalendar;
@ -70,15 +69,12 @@ import java.util.Hashtable;
import java.util.LinkedList;
import java.util.List;
import java.util.TimeZone;
import java.util.Vector;
import de.anomic.http.httpc;
import de.anomic.net.natLib;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverCore;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
public class yacyCore {

@ -266,6 +266,8 @@ public class yacyPeerActions {
}
} catch (java.text.ParseException e) {
ctime = yacyCore.universalTime();
} catch (java.lang.NumberFormatException e) {
ctime = yacyCore.universalTime();
}
if (Math.abs(yacyCore.universalTime() - ctime) > 3600000) {

@ -45,9 +45,9 @@ import java.util.Iterator;
import java.util.Set;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearch;
import de.anomic.plasma.plasmaSnippetCache;
public class yacySearch extends Thread {

@ -58,7 +58,6 @@ import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.net.ftpc;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverSwitch;

@ -64,9 +64,6 @@
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
@ -78,22 +75,23 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Enumeration;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.data.translator;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
@ -107,7 +105,6 @@ import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacyCore;
import de.anomic.data.translator;
public final class yacy {
