|
|
@ -29,6 +29,7 @@ import java.util.HashSet;
|
|
|
|
import java.util.Iterator;
|
|
|
|
import java.util.Iterator;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Set;
|
|
|
|
import java.util.Set;
|
|
|
|
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
|
|
|
|
|
|
|
import de.anomic.kelondro.blob.BLOBHeap;
|
|
|
|
import de.anomic.kelondro.blob.BLOBHeap;
|
|
|
@ -58,7 +59,7 @@ public class CrawlProfile {
|
|
|
|
return s;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static HashMap<String, Map<String, DomProfile>> domsCache = new HashMap<String, Map<String, DomProfile>>();
|
|
|
|
static HashMap<String, ConcurrentHashMap<String, DomProfile>> domsCache = new HashMap<String, ConcurrentHashMap<String, DomProfile>>();
|
|
|
|
|
|
|
|
|
|
|
|
MapView profileTable;
|
|
|
|
MapView profileTable;
|
|
|
|
private final File profileTableFile;
|
|
|
|
private final File profileTableFile;
|
|
|
@ -278,7 +279,7 @@ public class CrawlProfile {
|
|
|
|
public static final String XPSTOPW = "xpstopw";
|
|
|
|
public static final String XPSTOPW = "xpstopw";
|
|
|
|
|
|
|
|
|
|
|
|
Map<String, String> mem;
|
|
|
|
Map<String, String> mem;
|
|
|
|
private Map<String, DomProfile> doms;
|
|
|
|
private ConcurrentHashMap<String, DomProfile> doms;
|
|
|
|
private Pattern mustmatch = null, mustnotmatch = null;
|
|
|
|
private Pattern mustmatch = null, mustnotmatch = null;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -316,7 +317,7 @@ public class CrawlProfile {
|
|
|
|
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
|
|
|
|
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
|
|
|
|
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
|
|
|
|
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
|
|
|
|
|
|
|
|
|
|
|
|
doms = new HashMap<String, DomProfile>();
|
|
|
|
doms = new ConcurrentHashMap<String, DomProfile>();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public String toString() {
|
|
|
|
public String toString() {
|
|
|
@ -332,7 +333,7 @@ public class CrawlProfile {
|
|
|
|
public entry(final Map<String, String> mem) {
|
|
|
|
public entry(final Map<String, String> mem) {
|
|
|
|
this.mem = mem;
|
|
|
|
this.mem = mem;
|
|
|
|
this.doms = domsCache.get(this.mem.get(HANDLE));
|
|
|
|
this.doms = domsCache.get(this.mem.get(HANDLE));
|
|
|
|
if (this.doms == null) this.doms = new HashMap<String, DomProfile>();
|
|
|
|
if (this.doms == null) this.doms = new ConcurrentHashMap<String, DomProfile>();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public Map<String, String> map() {
|
|
|
|
public Map<String, String> map() {
|
|
|
@ -462,41 +463,35 @@ public class CrawlProfile {
|
|
|
|
return (r.equals(Boolean.TRUE.toString()));
|
|
|
|
return (r.equals(Boolean.TRUE.toString()));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public void domInc(final String domain, final String referrer, final int depth) {
|
|
|
|
public void domInc(final String domain, final String referrer, final int depth) {
|
|
|
|
synchronized (domain.intern()) {
|
|
|
|
final DomProfile dp = doms.get(domain);
|
|
|
|
final DomProfile dp = doms.get(domain);
|
|
|
|
if (dp == null) {
|
|
|
|
if (dp == null) {
|
|
|
|
// new domain
|
|
|
|
// new domain
|
|
|
|
doms.put(domain, new DomProfile(referrer, depth));
|
|
|
|
doms.put(domain, new DomProfile(referrer, depth));
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// increase counter
|
|
|
|
// increase counter
|
|
|
|
dp.inc();
|
|
|
|
dp.inc();
|
|
|
|
doms.put(domain, dp);
|
|
|
|
doms.put(domain, dp);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
domsCache.put(this.mem.get(HANDLE), doms);
|
|
|
|
domsCache.put(this.mem.get(HANDLE), doms);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public boolean grantedDomAppearance(final String domain) {
|
|
|
|
public boolean grantedDomAppearance(final String domain) {
|
|
|
|
final int max = domFilterDepth();
|
|
|
|
final int max = domFilterDepth();
|
|
|
|
if (max == Integer.MAX_VALUE) return true;
|
|
|
|
if (max == Integer.MAX_VALUE) return true;
|
|
|
|
synchronized (domain.intern()) {
|
|
|
|
final DomProfile dp = doms.get(domain);
|
|
|
|
final DomProfile dp = doms.get(domain);
|
|
|
|
if (dp == null) {
|
|
|
|
if (dp == null) {
|
|
|
|
return 0 < max;
|
|
|
|
return 0 < max;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return dp.depth <= max;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return dp.depth <= max;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public boolean grantedDomCount(final String domain) {
|
|
|
|
public boolean grantedDomCount(final String domain) {
|
|
|
|
final int max = domMaxPages();
|
|
|
|
final int max = domMaxPages();
|
|
|
|
if (max == Integer.MAX_VALUE) return true;
|
|
|
|
if (max == Integer.MAX_VALUE) return true;
|
|
|
|
synchronized (domain.intern()) {
|
|
|
|
final DomProfile dp = doms.get(domain);
|
|
|
|
final DomProfile dp = doms.get(domain);
|
|
|
|
if (dp == null) {
|
|
|
|
if (dp == null) {
|
|
|
|
return 0 < max;
|
|
|
|
return 0 < max;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return dp.count <= max;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return dp.count <= max;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
public int domSize() {
|
|
|
|
public int domSize() {
|
|
|
|
return doms.size();
|
|
|
|
return doms.size();
|
|
|
|