fixed missing remove operation in balancer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4990 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 606b323a2d
commit 05c26d58d9

@@ -3,7 +3,7 @@ javacSource=1.5
javacTarget=1.5
# Release Configuration
releaseVersion=0.592
releaseVersion=0.593
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz

@@ -241,6 +241,20 @@ public class Balancer {
if (urlHashes.contains(h)) j.remove();
}
// iterate through the domain stacks
Iterator<Map.Entry<String, LinkedList<String>>> k = domainStacks.entrySet().iterator();
Map.Entry<String, LinkedList<String>> se;
LinkedList<String> stack;
while (k.hasNext()) {
se = k.next();
stack = se.getValue();
i = stack.iterator();
while (i.hasNext()) {
if (urlHashes.contains(i.next())) i.remove();
}
if (stack.size() == 0) k.remove();
}
return removedCounter;
}
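
For readers skimming the hunk: the added loop walks every per-domain stack, deletes any hash contained in the passed set through the iterators' own remove(), and drops a domain entry once its stack runs empty. A minimal, self-contained sketch of that pattern with made-up names (not the actual Balancer fields):

// Minimal sketch of the removal pattern added above (simplified names,
// not the actual Balancer class): delete a set of URL hashes from every
// per-domain stack and drop stacks that become empty.
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

public class DomainStackRemovalSketch {

    static void removeHashes(Map<String, LinkedList<String>> domainStacks, Set<String> urlHashes) {
        Iterator<Map.Entry<String, LinkedList<String>>> k = domainStacks.entrySet().iterator();
        while (k.hasNext()) {
            LinkedList<String> stack = k.next().getValue();
            Iterator<String> i = stack.iterator();
            while (i.hasNext()) {
                // Iterator.remove() is the only safe way to delete while iterating
                if (urlHashes.contains(i.next())) i.remove();
            }
            // a domain whose stack ran empty is removed from the map as well
            if (stack.size() == 0) k.remove();
        }
    }

    public static void main(String[] args) {
        Map<String, LinkedList<String>> stacks = new HashMap<String, LinkedList<String>>();
        stacks.put("example.org:80", new LinkedList<String>(Arrays.asList("hashA", "hashB")));
        stacks.put("example.net:80", new LinkedList<String>(Arrays.asList("hashC")));
        removeHashes(stacks, new HashSet<String>(Arrays.asList("hashB", "hashC")));
        System.out.println(stacks); // {example.org:80=[hashA]}
    }
}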
@@ -256,13 +270,15 @@ public class Balancer {
public int size() {
int componentsize = urlFileIndex.size();
/*
assert componentsize == urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks() :
"size wrong in " + stackname +
" - urlFileIndex = " + urlFileIndex.size() +
", componentsize = " + componentsize +
", componentsize = " + urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks() +
" = (urlFileStack = " + urlFileStack.size() +
", urlRAMStack = " + urlRAMStack.size() +
", sizeDomainStacks = " + sizeDomainStacks() + ")";
*/
return componentsize;
}
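
A side note on the assertion this hunk comments out: in its message, the second "componentsize" term appends urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks() directly after a string literal, so Java performs string concatenation rather than addition; the intended sum would need parentheses. A tiny generic illustration of that precedence difference (not YaCy code):

// Generic illustration of the + precedence pitfall seen in the disabled assert message.
public class ConcatVsSum {
    public static void main(String[] args) {
        int fileStack = 2, ramStack = 3, domainStacks = 4;
        System.out.println("componentsize = " + fileStack + ramStack + domainStacks);   // componentsize = 234
        System.out.println("componentsize = " + (fileStack + ramStack + domainStacks)); // componentsize = 9
    }
}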

@@ -56,6 +56,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.http.HttpClient;
import de.anomic.http.JakartaCommonsHttpClient;
@@ -78,6 +79,7 @@ public class RobotsTxt {
kelondroMap robotsTable;
private final File robotsTableFile;
private ConcurrentHashMap<String, Long> syncObjects;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
public RobotsTxt(File robotsTableFile) {
@@ -94,6 +96,7 @@ public class RobotsTxt {
blob = new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
}
robotsTable = new kelondroMap(blob, 100);
syncObjects = new ConcurrentHashMap<String, Long>();
}
private void resetDatabase() {
@@ -133,86 +136,98 @@ public class RobotsTxt {
robotsTxt4Host == null ||
robotsTxt4Host.getLoadedDate() == null ||
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000
)) synchronized(this) {
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
)) {
// check the robots table again for all threads that come here because they waited for another one
// to complete a download
try {
HashMap<String, String> record = this.robotsTable.get(urlHostPort);
if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
} catch (kelondroException e) {
resetDatabase();
} catch (IOException e) {
resetDatabase();
}
if (robotsTxt4Host != null &&
robotsTxt4Host.getLoadedDate() != null &&
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 7*24*60*60*1000) {
return robotsTxt4Host;
// make or get a synchronization object
Long syncObj = this.syncObjects.get(urlHostPort);
if (syncObj == null) {
syncObj = new Long(System.currentTimeMillis());
this.syncObjects.put(urlHostPort, syncObj);
}
// generating the proper url to download the robots txt
yacyURL robotsURL = null;
try {
robotsURL = new yacyURL("http://" + urlHostPort + "/robots.txt", null);
} catch (MalformedURLException e) {
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.");
robotsURL = null;
}
Object[] result = null;
if (robotsURL != null) {
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
// we can now synchronize for each host separately
synchronized (syncObj) {
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
// check the robots table again for all threads that come here because they waited for another one
// to complete a download
try {
result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
} catch (Exception e) {
result = null;
HashMap<String, String> record = this.robotsTable.get(urlHostPort);
if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
} catch (kelondroException e) {
resetDatabase();
} catch (IOException e) {
resetDatabase();
}
}
/*
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
"robots-url=" + robotsURL.toString() +
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
loadedRobots.add(robotsURL.toNormalform(false, false));
*/
if (result == null) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new Entry(
urlHostPort,
new ArrayList<String>(),
new Date(),
new Date(),
null,
null,
new Integer(0));
} else {
robotsTxt4Host.setLoadedDate(new Date());
if (robotsTxt4Host != null &&
robotsTxt4Host.getLoadedDate() != null &&
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 7*24*60*60*1000) {
return robotsTxt4Host;
}
// store the data into the robots DB
addEntry(robotsTxt4Host);
} else {
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
denyPath.add("/");
// generating the proper url to download the robots txt
yacyURL robotsURL = null;
try {
robotsURL = new yacyURL("http://" + urlHostPort + "/robots.txt", null);
} catch (MalformedURLException e) {
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.");
robotsURL = null;
}
Object[] result = null;
if (robotsURL != null) {
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
try {
result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
} catch (Exception e) {
result = null;
}
}
/*
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
"robots-url=" + robotsURL.toString() +
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
loadedRobots.add(robotsURL.toNormalform(false, false));
*/
// store the data into the robots DB
robotsTxt4Host = addEntry(
urlHostPort,
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
(String) parserResult[1],
(Integer) parserResult[2]);
if (result == null) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new Entry(
urlHostPort,
new ArrayList<String>(),
new Date(),
new Date(),
null,
null,
new Integer(0));
} else {
robotsTxt4Host.setLoadedDate(new Date());
}
// store the data into the robots DB
addEntry(robotsTxt4Host);
} else {
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
denyPath.add("/");
}
// store the data into the robots DB
robotsTxt4Host = addEntry(
urlHostPort,
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
(String) parserResult[1],
(Integer) parserResult[2]);
}
}
}
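
Taken together, the RobotsTxt changes replace the class-wide synchronized(this) with one synchronization object per host:port and re-check the robots table after the lock is obtained, so threads that waited simply reuse the entry a concurrent thread just stored while downloads for different hosts proceed in parallel. A compact, self-contained sketch of that check / lock / re-check shape, with hypothetical names (cache, download and the 7-day freshness window stand in for the kelondroMap lookup and downloadRobotsTxt):

// Hypothetical condensation of the pattern in the hunk above; not the real
// RobotsTxt class. One lock object per host:port, plus a re-check of the
// cache after the lock is acquired.
import java.util.concurrent.ConcurrentHashMap;

public class PerHostRobotsCacheSketch {
    private static final long MAX_AGE_MS = 7L * 24 * 60 * 60 * 1000; // 7 days, as in the patch

    static class CachedRobots {
        final String body;
        final long loadedAt;
        CachedRobots(String body, long loadedAt) { this.body = body; this.loadedAt = loadedAt; }
    }

    private final ConcurrentHashMap<String, CachedRobots> cache =
            new ConcurrentHashMap<String, CachedRobots>();
    private final ConcurrentHashMap<String, Object> syncObjects =
            new ConcurrentHashMap<String, Object>();

    public CachedRobots robotsFor(String hostPort) {
        CachedRobots entry = cache.get(hostPort);          // first check, no lock
        if (isFresh(entry)) return entry;
        synchronized (lockFor(hostPort)) {                 // per-host lock, hosts don't block each other
            entry = cache.get(hostPort);                   // re-check: another thread may have loaded it
            if (isFresh(entry)) return entry;
            entry = new CachedRobots(download(hostPort), System.currentTimeMillis());
            cache.put(hostPort, entry);                    // store so waiting threads reuse the result
            return entry;
        }
    }

    private boolean isFresh(CachedRobots e) {
        return e != null && System.currentTimeMillis() - e.loadedAt <= MAX_AGE_MS;
    }

    private Object lockFor(String hostPort) {
        Object fresh = new Object();
        Object prior = syncObjects.putIfAbsent(hostPort, fresh); // one shared lock per host
        return prior != null ? prior : fresh;
    }

    private String download(String hostPort) {
        return ""; // placeholder; the real code fetches http://<host:port>/robots.txt
    }
}

The committed code creates the per-host Long lock via get()/put(); the putIfAbsent() variant above is just one way to make sure concurrent first callers for the same host end up on the same lock object.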
