automatic limitation of web structure host count

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3867 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 8b0aea6910
commit dfd5e823c3

@ -36,6 +36,7 @@ import java.util.Map;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.SortedMap;
import java.util.TreeSet;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
@ -47,8 +48,9 @@ public class plasmaWebStructure {
public static int maxCRLDump = 500000;
public static int maxCRGDump = 200000;
public static int maxref = 100; // maximum number of references, to avoid overflow when a large link farm occurs (e.g. wikipedia)
public static int maxref = 200; // maximum number of references, to avoid overflow when a large link farm occurs (e.g. wikipedia)
public static int maxhosts = 4000; // maximum number of hosts in web structure map
private StringBuffer crg; // global citation references
private serverLog log;
private File rankingPath, structureFile;
@ -63,8 +65,31 @@ public class plasmaWebStructure {
this.crg = new StringBuffer(maxCRGDump);
this.structure = new TreeMap();
this.structureFile = structureFile;
// load web structure
Map loadedStructure = serverFileUtils.loadHashMap(this.structureFile);
if (loadedStructure != null) this.structure.putAll(loadedStructure);
// delete outdated entries in case the structure is too big
if (this.structure.size() > maxhosts) {
// fill a set with the last-modified dates of the structure entries
TreeSet delset = new TreeSet();
Map.Entry entry;
Iterator i = this.structure.entrySet().iterator();
String key, value;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
key = (String) entry.getKey();
value = (String) entry.getValue();
delset.add(value.substring(0, 8) + key);
}
int delcount = this.structure.size() - (maxhosts * 9 / 10);
i = delset.iterator();
while ((delcount > 0) && (i.hasNext())) {
this.structure.remove(((String) i.next()).substring(8));
delcount--;
}
}
}
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(URL url, String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {

Loading…
Cancel
Save