- less automatic indexing after a search (needs to reset the default crawl profiles)
- fix for concurrency problem in storage of serverSwitch Properties
- markup update
pull/1/head
Michael Christen 13 years ago
parent f62e6fb438
commit e7e429705a

@@ -45,18 +45,19 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
import net.yacy.repository.RegexHelper;
public final class CrawlSwitchboard {
public final class CrawlSwitchboard
{
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@@ -65,30 +66,28 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private MapHeap profilesActiveCrawls;
private final Log log;
private MapHeap profilesActiveCrawls;
private final MapHeap profilesPassiveCrawls;
private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
public CrawlSwitchboard(
final String networkName,
final Log log,
final File queuesRoot) {
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
public CrawlSwitchboard(final String networkName, final Log log, final File queuesRoot) {
log.logInfo("Initializing Word Index for the network '" + networkName + "'.");
if (networkName == null || networkName.length() == 0) {
if ( networkName == null || networkName.length() == 0 ) {
log.logSevere("no network name given - shutting down");
System.exit(0);
}
this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
this.profilesActiveCrawlsCache =
Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
// make crawl profiles database and default profiles
this.queuesRoot = queuesRoot;
@@ -97,84 +96,115 @@ public final class CrawlSwitchboard {
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
CrawlProfile p;
try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException e) {
} catch ( final IOException e ) {
p = null;
} catch (final RowSpaceExceededException e) {
} catch ( final RowSpaceExceededException e ) {
p = null;
}
if (p == null) continue;
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
if ( p == null ) {
continue;
}
if ( !RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH)) ) {
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
Log.logWarning("CrawlProfiles", "removed Profile "
+ p.handle()
+ ": "
+ p.name()
+ " from active crawls since "
+ CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: "
+ p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if ( !RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH)) ) {
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
Log.logWarning("CrawlProfiles", "removed Profile "
+ p.handle()
+ ": "
+ p.name()
+ " from active crawls since "
+ CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: "
+ p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
} else {
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
}
initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
log.logInfo("Loaded active crawl profiles from file "
+ profilesActiveFile.getName()
+ ", "
+ this.profilesActiveCrawls.size()
+ " entries");
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
for (final byte[] handle : this.profilesPassiveCrawls.keySet()) {
for ( final byte[] handle : this.profilesPassiveCrawls.keySet() ) {
CrawlProfile p;
try {
p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
} catch (final IOException e) {
} catch ( final IOException e ) {
continue;
} catch (final RowSpaceExceededException e) {
} catch ( final RowSpaceExceededException e ) {
continue;
}
}
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" +
", " + profilesPassiveFile.length()/1024);
log.logInfo("Loaded passive crawl profiles from file "
+ profilesPassiveFile.getName()
+ ", "
+ this.profilesPassiveCrawls.size()
+ " entries"
+ ", "
+ profilesPassiveFile.length()
/ 1024);
}
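
The constructor above discards any stored profile whose FILTER_URL_MUSTMATCH or FILTER_URL_MUSTNOTMATCH value is not a valid regular expression. As an illustrative sketch only (RegexHelper itself is not part of this diff), such a check can be implemented by attempting to compile the pattern:

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public final class RegexCheck {
    /** Returns true if the given string compiles as a java.util.regex pattern. */
    public static boolean isValidRegex(final String s) {
        if (s == null) return false;
        try {
            Pattern.compile(s);
            return true;
        } catch (final PatternSyntaxException e) {
            return false;
        }
    }

    public static void main(final String[] args) {
        System.out.println(isValidRegex(".*"));  // true
        System.out.println(isValidRegex("(["));  // false: unclosed group and class
    }
}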
public CrawlProfile getActive(final byte[] profileKey) {
if (profileKey == null) return null;
if ( profileKey == null ) {
return null;
}
// get from cache
CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey);
if (p != null) return p;
if ( p != null ) {
return p;
}
// get from db
Map<String, String> m;
try {
m = this.profilesActiveCrawls.get(profileKey);
} catch (final IOException e) {
} catch ( final IOException e ) {
m = null;
} catch (final RowSpaceExceededException e) {
} catch ( final RowSpaceExceededException e ) {
m = null;
}
if (m == null) return null;
if ( m == null ) {
return null;
}
p = new CrawlProfile(m);
this.profilesActiveCrawlsCache.put(profileKey, p);
return p;
}
public CrawlProfile getPassive(final byte[] profileKey) {
if (profileKey == null) return null;
if ( profileKey == null ) {
return null;
}
Map<String, String> m;
try {
m = this.profilesPassiveCrawls.get(profileKey);
} catch (final IOException e) {
} catch ( final IOException e ) {
m = null;
} catch (final RowSpaceExceededException e) {
} catch ( final RowSpaceExceededException e ) {
m = null;
}
if (m == null) return null;
if ( m == null ) {
return null;
}
return new CrawlProfile(m);
}
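
getActive() reads through the small in-memory profilesActiveCrawlsCache before touching the profilesActiveCrawls heap. A minimal sketch of that read path, using String keys and a HashMap stand-in for the heap instead of the byte[]/Base64Order/MapHeap types used above:

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

public final class CachedLookup {
    private final Map<String, String> cache = Collections.synchronizedMap(new TreeMap<String, String>());
    private final Map<String, String> store = new HashMap<String, String>(); // stands in for the on-disk heap

    public String get(final String key) {
        if (key == null) return null;
        String v = this.cache.get(key);   // 1. fast path: in-memory cache
        if (v != null) return v;
        v = this.store.get(key);          // 2. slow path: backing store
        if (v == null) return null;
        this.cache.put(key, v);           // 3. populate the cache for the next call
        return v;
    }

    public static void main(final String[] args) {
        final CachedLookup l = new CachedLookup();
        l.store.put("profileKey", "profileData");
        System.out.println(l.get("profileKey") + " / cached: " + l.cache.containsKey("profileKey"));
    }
}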
@@ -187,13 +217,17 @@ public final class CrawlSwitchboard {
}
public void removeActive(final byte[] profileKey) {
if (profileKey == null) return;
if ( profileKey == null ) {
return;
}
this.profilesActiveCrawlsCache.remove(profileKey);
this.profilesActiveCrawls.remove(profileKey);
}
public void removePassive(final byte[] profileKey) {
if (profileKey == null) return;
if ( profileKey == null ) {
return;
}
this.profilesPassiveCrawls.remove(profileKey);
}
@@ -217,18 +251,32 @@ public final class CrawlSwitchboard {
CrawlProfile profile;
String name;
try {
for (final byte[] handle: this.profilesActiveCrawls.keySet()) {
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
name = profile.name();
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) this.defaultTextSnippetLocalProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile;
if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile;
if ( name.equals(CRAWL_PROFILE_PROXY) ) {
this.defaultProxyProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_REMOTE) ) {
this.defaultRemoteProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
this.defaultTextSnippetLocalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
this.defaultTextSnippetGlobalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
this.defaultMediaSnippetLocalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
this.defaultMediaSnippetGlobalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SURROGATE) ) {
this.defaultSurrogateProfile = profile;
}
}
} catch (final Exception e) {
} catch ( final Exception e ) {
this.profilesActiveCrawls.clear();
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
@@ -239,69 +287,215 @@ public final class CrawlSwitchboard {
this.defaultSurrogateProfile = null;
}
if (this.defaultProxyProfile == null) {
if ( this.defaultProxyProfile == null ) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile(
"proxy", null,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
this.defaultProxyProfile =
new CrawlProfile(
"proxy",
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
true,
true,
true,
CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
}
if (this.defaultRemoteProfile == null) {
if ( this.defaultRemoteProfile == null ) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
this.defaultRemoteProfile =
new CrawlProfile(
CRAWL_PROFILE_REMOTE,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
CrawlProfile.MATCH_NEVER_STRING,
0,
false,
-1,
-1,
true,
true,
true,
false,
false,
true,
true,
false,
CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
if ( this.defaultTextSnippetLocalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
this.defaultTextSnippetLocalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
-1,
true,
false,
false,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
if ( this.defaultTextSnippetGlobalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
this.defaultTextSnippetGlobalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
-1,
true,
true,
true,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
if ( this.defaultMediaSnippetLocalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
this.defaultMediaSnippetLocalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
false,
false,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
if ( this.defaultMediaSnippetGlobalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
this.defaultMediaSnippetGlobalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
false,
true,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
if ( this.defaultSurrogateProfile == null ) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
this.defaultSurrogateProfile =
new CrawlProfile(
CRAWL_PROFILE_SURROGATE,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
-1,
true,
true,
false,
false,
false,
true,
true,
false,
CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);
}
}
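
initActiveCrawlProfiles() binds the well-known profile names to their fields and creates any default profile that is missing. Note that, besides the reformatting, the rewritten constructor calls change the boolean argument immediately following the 0 crawl-depth argument from true to false for the remote and the four snippet profiles; per the commit message this is what reduces automatic indexing after a search. A reduced sketch of the find-or-create step, with a hypothetical Profile class standing in for CrawlProfile and a flag name chosen only for illustration:

import java.util.HashMap;
import java.util.Map;

public final class DefaultProfiles {
    /** Hypothetical stand-in for CrawlProfile: just a name and an "index automatically" flag. */
    static final class Profile {
        final String name;
        final boolean indexAutomatically;
        Profile(final String name, final boolean indexAutomatically) {
            this.name = name;
            this.indexAutomatically = indexAutomatically;
        }
    }

    /** Returns the stored profile with the given name, creating and persisting a default if absent. */
    static Profile ensureDefault(final Map<String, Profile> store, final String name, final boolean indexAutomatically) {
        for (final Profile p : store.values()) {
            if (p.name.equals(name)) return p;    // already present: reuse it
        }
        final Profile created = new Profile(name, indexAutomatically);
        store.put(name, created);                 // persist the freshly created default
        return created;
    }

    public static void main(final String[] args) {
        final Map<String, Profile> store = new HashMap<String, Profile>();
        // snippet profiles are created with automatic indexing switched off
        final Profile snippetLocalText = ensureDefault(store, "snippetLocalText", false);
        System.out.println(snippetLocalText.name + " indexes automatically: " + snippetLocalText.indexAutomatically);
    }
}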
private void resetProfiles() {
this.profilesActiveCrawlsCache.clear();
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) FileUtils.deletedelete(pdb);
if ( pdb.exists() ) {
FileUtils.deletedelete(pdb);
}
try {
this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch (final IOException e1) {
this.profilesActiveCrawls =
new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e1 ) {
Log.logException(e1);
this.profilesActiveCrawls = null;
}
@@ -313,48 +507,49 @@ public final class CrawlSwitchboard {
CrawlProfile entry;
boolean hasDoneSomething = false;
try {
for (final byte[] handle: this.profilesActiveCrawls.keySet()) {
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
// check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
if ( Thread.currentThread().isInterrupted() ) {
throw new InterruptedException("Shutdown in progress");
}
// getting next profile
try {
entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException e) {
} catch ( final IOException e ) {
continue;
} catch (final RowSpaceExceededException e) {
} catch ( final RowSpaceExceededException e ) {
continue;
}
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
if ( !((entry.name().equals(CRAWL_PROFILE_PROXY))
|| (entry.name().equals(CRAWL_PROFILE_REMOTE))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name()
.equals(CRAWL_PROFILE_SURROGATE))) ) {
final CrawlProfile p = new CrawlProfile(entry);
this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p);
this.profilesActiveCrawls.remove(handle);
hasDoneSomething = true;
}
}
} catch (final kelondroException e) {
} catch ( final kelondroException e ) {
resetProfiles();
hasDoneSomething = true;
}
return hasDoneSomething;
}
public void close() {
this.profilesActiveCrawlsCache.clear();
this.profilesActiveCrawls.close();
this.profilesPassiveCrawls.close();
}
/**
* Loads crawl profiles from a DB file.
*
* @param file DB file
* @return crawl profile data
*/
@@ -362,12 +557,14 @@ public final class CrawlSwitchboard {
MapHeap ret;
try {
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch (final IOException e) {
Log.logException(e);Log.logException(e);
} catch ( final IOException e ) {
Log.logException(e);
Log.logException(e);
FileUtils.deletedelete(file);
try {
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch (final IOException e1) {
ret =
new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e1 ) {
Log.logException(e1);
ret = null;
}
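
loadFromDB() opens the profile heap and, when that fails with an IOException, deletes the presumably corrupted file and retries once before returning null. A generic open-or-recreate sketch of the same idea, using a plain java.util.Properties file instead of MapHeap:

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Properties;

public final class OpenOrRecreate {
    /** Opens the given store; on failure deletes the file and retries once, otherwise returns null. */
    static Properties load(final File file) {
        try {
            return read(file);
        } catch (final IOException e) {
            file.delete();                 // assume the file is corrupted and start over
            try {
                return read(file);
            } catch (final IOException e1) {
                return null;               // give up; the caller must handle a missing store
            }
        }
    }

    private static Properties read(final File file) throws IOException {
        final Properties p = new Properties();
        if (file.exists()) {
            final FileInputStream in = new FileInputStream(file);
            try { p.load(in); } finally { in.close(); }
        } else {
            final FileOutputStream out = new FileOutputStream(file); // create an empty store
            try { p.store(out, "created"); } finally { out.close(); }
        }
        return p;
    }
}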

@@ -39,8 +39,8 @@ import java.util.Map;
import java.util.NavigableMap;
import java.util.Random;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
@@ -52,27 +52,31 @@ import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.kelondro.workflow.WorkflowThread;
import de.anomic.server.serverAccessTracker.Track;
import de.anomic.server.serverCore.Session;
public class serverSwitch {
public class serverSwitch
{
// configuration management
private final File configFile;
private final String configComment;
private final File dataPath;
protected final File appPath;
protected boolean firstInit;
protected Log log;
protected int serverJobs;
private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved;
private final ConcurrentMap<InetAddress, String> authorization;
private final NavigableMap<String, BusyThread> workerThreads;
private final serverAccessTracker accessTracker;
public serverSwitch(final File dataPath, final File appPath, final String initPath, final String configPath) {
private final File configFile;
private final String configComment;
private final File dataPath;
protected final File appPath;
protected boolean firstInit;
protected Log log;
protected int serverJobs;
private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved;
private final ConcurrentMap<InetAddress, String> authorization;
private final NavigableMap<String, BusyThread> workerThreads;
private final serverAccessTracker accessTracker;
public serverSwitch(
final File dataPath,
final File appPath,
final String initPath,
final String configPath) {
// we initialize the switchboard with a property file,
// but maintain these properties then later in a new 'config' file
// to reset all changed configs, the config file must
@@ -81,47 +85,51 @@ public class serverSwitch {
// file name of the config file
this.dataPath = dataPath;
this.appPath = appPath;
this.configComment = "This is an automatically generated file, updated by serverAbstractSwitch and initialized by " + initPath;
this.configComment =
"This is an automatically generated file, updated by serverAbstractSwitch and initialized by "
+ initPath;
final File initFile = new File(appPath, initPath);
this.configFile = new File(dataPath, configPath); // propertiesFile(config);
firstInit = !configFile.exists(); // this is true if the application was started for the first time
new File(configFile.getParent()).mkdir();
this.firstInit = !this.configFile.exists(); // this is true if the application was started for the first time
new File(this.configFile.getParent()).mkdir();
// predefine init's
final ConcurrentMap<String, String> initProps;
if (initFile.exists())
if ( initFile.exists() ) {
initProps = FileUtils.loadMap(initFile);
else
} else {
initProps = new ConcurrentHashMap<String, String>();
}
// if 'pro'-version is selected, overload standard settings with 'pro'-settings
Iterator<String> i;
String prop;
// delete the 'pro' init settings
i = initProps.keySet().iterator();
while (i.hasNext()) {
while ( i.hasNext() ) {
prop = i.next();
if (prop.endsWith("__pro")) {
if ( prop.endsWith("__pro") ) {
i.remove();
}
}
// load config's from last save
if (configFile.exists())
configProps = FileUtils.loadMap(configFile);
else
configProps = new ConcurrentHashMap<String, String>();
if ( this.configFile.exists() ) {
this.configProps = FileUtils.loadMap(this.configFile);
} else {
this.configProps = new ConcurrentHashMap<String, String>();
}
// remove all values from config that do not appear in init
configRemoved = new ConcurrentHashMap<String, String>();
synchronized (configProps) {
i = configProps.keySet().iterator();
this.configRemoved = new ConcurrentHashMap<String, String>();
synchronized ( this.configProps ) {
i = this.configProps.keySet().iterator();
String key;
while (i.hasNext()) {
while ( i.hasNext() ) {
key = i.next();
if (!(initProps.containsKey(key))) {
configRemoved.put(key, this.configProps.get(key));
if ( !(initProps.containsKey(key)) ) {
this.configRemoved.put(key, this.configProps.get(key));
i.remove();
}
}
@@ -132,8 +140,8 @@ public class serverSwitch {
// merge new props from init to config
// this is necessary for migration, when new properties are attached
initProps.putAll(configProps);
configProps = initProps;
initProps.putAll(this.configProps);
this.configProps = initProps;
// save result; this may initially create a config file after
// initialization
@@ -141,48 +149,50 @@ public class serverSwitch {
}
// other settings
authorization = new ConcurrentHashMap<InetAddress, String>();
this.authorization = new ConcurrentHashMap<InetAddress, String>();
// init thread control
workerThreads = new TreeMap<String, BusyThread>();
this.workerThreads = new TreeMap<String, BusyThread>();
// init busy state control
serverJobs = 0;
this.serverJobs = 0;
// init server tracking
this.accessTracker = new serverAccessTracker(
getConfigLong("server.maxTrackingTime", 60 * 60 * 1000),
(int) getConfigLong("server.maxTrackingCount", 1000),
(int) getConfigLong("server.maxTrackingHostCount", 100)
);
this.accessTracker =
new serverAccessTracker(
getConfigLong("server.maxTrackingTime", 60 * 60 * 1000),
(int) getConfigLong("server.maxTrackingCount", 1000),
(int) getConfigLong("server.maxTrackingHostCount", 100));
}
public String myPublicIP() {
// if a static IP was configured, we have to return it here ...
final String staticIP = getConfig("staticIP", "");
if (staticIP.length() > 0) {
if ( staticIP.length() > 0 ) {
return staticIP;
}
// otherwise we return the real IP address of this host
final InetAddress pLIP = Domains.myPublicLocalIP();
if (pLIP != null) return pLIP.getHostAddress();
if ( pLIP != null ) {
return pLIP.getHostAddress();
}
return null;
}
// a logger for this switchboard
public void setLog(final Log log) {
this.log = log;
this.log = log;
}
public Log getLog() {
return log;
return this.log;
}
public void setConfig(final Map<String, String> otherConfigs) {
final Iterator<Map.Entry<String, String>> i = otherConfigs.entrySet().iterator();
Map.Entry<String, String> entry;
while (i.hasNext()) {
while ( i.hasNext() ) {
entry = i.next();
setConfig(entry.getKey(), entry.getValue());
}
@@ -202,94 +212,99 @@ public class serverSwitch {
public void setConfig(final String key, final String value) {
// set the value
final String oldValue = configProps.put(key, value);
if (oldValue == null || !value.equals(oldValue)) saveConfig();
final String oldValue = this.configProps.put(key, value);
if ( oldValue == null || !value.equals(oldValue) ) {
saveConfig();
}
}
public void removeConfig(final String key) {
configProps.remove(key);
this.configProps.remove(key);
}
/**
* Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be
* found or if it is invalid
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
public String getConfig(final String key, final String dflt) {
// get the value
final String s = configProps.get(key);
final String s = this.configProps.get(key);
// return value
if (s == null) return dflt;
if ( s == null ) {
return dflt;
}
return s;
}
/**
* Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be
* found or if it is invalid
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
public long getConfigLong(final String key, final long dflt) {
try {
return Long.parseLong(getConfig(key, Long.toString(dflt)));
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
return dflt;
}
}
/**
* Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be
* found or if it is invalid
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
public double getConfigFloat(final String key, final float dflt) {
try {
return Float.parseFloat(getConfig(key, Float.toString(dflt)));
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
return dflt;
}
}
/**
* Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be
* found or if it is invalid
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
public int getConfigInt(final String key, final int dflt) {
try {
return Integer.parseInt(getConfig(key, Integer.toString(dflt)));
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
return dflt;
}
}
/**
* Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be
* found or if it is invalid
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
public boolean getConfigBool(final String key, final boolean dflt) {
return Boolean.parseBoolean(getConfig(key, Boolean.toString(dflt)));
}
/**
* Create a File instance for a configuration setting specifying a path.
* @param key config key
* @param dflt default path value, that is used when there is no value
* <code>key</code> in the configuration.
* @return if the value of the setting is an absolute path String, then the
* returned File is derived from this setting only. Otherwise the path's file
* is constructed from the applications root path + the relative path setting.
*
* @param key config key
* @param dflt default path value, that is used when there is no value <code>key</code> in the
* configuration.
* @return if the value of the setting is an absolute path String, then the returned File is derived from
* this setting only. Otherwise the path's file is constructed from the applications root path +
* the relative path setting.
*/
public File getDataPath(final String key, final String dflt) {
File ret;
@@ -298,7 +313,7 @@ public class serverSwitch {
ret = (f.isAbsolute() ? new File(f.getAbsolutePath()) : new File(this.dataPath, path));
return ret;
}
public File getAppPath(final String key, final String dflt) {
File ret;
final String path = getConfig(key, dflt).replace('\\', '/');
@@ -308,265 +323,296 @@ public class serverSwitch {
}
public Iterator<String> configKeys() {
return configProps.keySet().iterator();
return this.configProps.keySet().iterator();
}
private void saveConfig() {
try {
ConcurrentMap<String, String> configPropsCopy = new ConcurrentHashMap<String, String>();
configPropsCopy.putAll(configProps); // avoid concurrency problems
FileUtils.saveMap(configFile, configPropsCopy, configComment);
} catch (final IOException e) {
log.logSevere("CONFIG: Cannot write config file " + configFile.toString() + ": " + e.getMessage(), e);
//System.out.println("ERROR: cannot write config file " + configFile.toString() + ": " + e.getMessage());
}
ConcurrentMap<String, String> configPropsCopy = new ConcurrentHashMap<String, String>();
configPropsCopy.putAll(this.configProps); // avoid concurrency problems
FileUtils.saveMap(this.configFile, configPropsCopy, this.configComment);
}
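
This hunk is the "fix for concurrency problem in storage of serverSwitch Properties" from the commit message: saveConfig() now copies the live configProps map into a fresh ConcurrentHashMap and hands the copy to FileUtils.saveMap, so a setConfig() running in another thread only touches the live map, never the snapshot being serialized. A self-contained sketch of the same snapshot-then-write idea (the file name and the Properties format are placeholders):

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public final class SnapshotSave {
    private final ConcurrentMap<String, String> config = new ConcurrentHashMap<String, String>();
    private final File configFile = new File("config.properties"); // hypothetical target file

    public void setConfig(final String key, final String value) {
        final String old = this.config.put(key, value);
        if (old == null || !value.equals(old)) save();  // only persist real changes
    }

    private void save() {
        // snapshot first: later writers mutate 'config', not the map that is written out
        final Map<String, String> copy = new ConcurrentHashMap<String, String>(this.config);
        final Properties p = new Properties();
        p.putAll(copy);
        try {
            final FileOutputStream out = new FileOutputStream(this.configFile);
            try { p.store(out, "snapshot"); } finally { out.close(); }
        } catch (final IOException e) {
            System.err.println("cannot write " + this.configFile + ": " + e.getMessage());
        }
    }
}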
/**
* Gets configuration parameters which have been removed during initialization.
*
* @return contains parameter name as key and parameter value as value
*/
public ConcurrentMap<String, String> getRemoved() {
return configRemoved;
return this.configRemoved;
}
public void deployThread(
final String threadName,
final String threadShortDescription,
final String threadLongDescription,
final String threadMonitorURL,
final BusyThread newThread,
final long startupDelay) {
deployThread(threadName, threadShortDescription, threadLongDescription, threadMonitorURL,
newThread, startupDelay,
Long.parseLong(getConfig(threadName + "_idlesleep" , "100")),
Long.parseLong(getConfig(threadName + "_busysleep" , "1000")),
Long.parseLong(getConfig(threadName + "_memprereq" , "1000000")));
final String threadName,
final String threadShortDescription,
final String threadLongDescription,
final String threadMonitorURL,
final BusyThread newThread,
final long startupDelay) {
deployThread(
threadName,
threadShortDescription,
threadLongDescription,
threadMonitorURL,
newThread,
startupDelay,
Long.parseLong(getConfig(threadName + "_idlesleep", "100")),
Long.parseLong(getConfig(threadName + "_busysleep", "1000")),
Long.parseLong(getConfig(threadName + "_memprereq", "1000000")));
}
public void deployThread(
final String threadName,
final String threadShortDescription,
final String threadLongDescription,
final String threadMonitorURL,
final BusyThread newThread,
final long startupDelay,
final long initialIdleSleep,
final long initialBusySleep,
final long initialMemoryPreRequisite) {
if (newThread.isAlive()) throw new RuntimeException("undeployed threads must not live; they are started as part of the deployment");
final String threadName,
final String threadShortDescription,
final String threadLongDescription,
final String threadMonitorURL,
final BusyThread newThread,
final long startupDelay,
final long initialIdleSleep,
final long initialBusySleep,
final long initialMemoryPreRequisite) {
if ( newThread.isAlive() ) {
throw new RuntimeException(
"undeployed threads must not live; they are started as part of the deployment");
}
newThread.setStartupSleep(startupDelay);
long x;
try {
x = Long.parseLong(getConfig(threadName + "_idlesleep" , "novalue"));
x = Long.parseLong(getConfig(threadName + "_idlesleep", "novalue"));
newThread.setIdleSleep(x);
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
newThread.setIdleSleep(initialIdleSleep);
setConfig(threadName + "_idlesleep", initialIdleSleep);
}
try {
x = Long.parseLong(getConfig(threadName + "_busysleep" , "novalue"));
x = Long.parseLong(getConfig(threadName + "_busysleep", "novalue"));
newThread.setBusySleep(x);
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
newThread.setBusySleep(initialBusySleep);
setConfig(threadName + "_busysleep", initialBusySleep);
}
try {
x = Long.parseLong(getConfig(threadName + "_memprereq" , "novalue"));
x = Long.parseLong(getConfig(threadName + "_memprereq", "novalue"));
newThread.setMemPreReqisite(x);
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
newThread.setMemPreReqisite(initialMemoryPreRequisite);
setConfig(threadName + "_memprereq", initialMemoryPreRequisite);
}
newThread.setDescription(threadShortDescription, threadLongDescription, threadMonitorURL);
workerThreads.put(threadName, newThread);
this.workerThreads.put(threadName, newThread);
// start the thread
if (workerThreads.containsKey(threadName)) newThread.start();
if ( this.workerThreads.containsKey(threadName) ) {
newThread.start();
}
}
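
deployThread() derives three tuning keys from the thread name (threadName + "_idlesleep", "_busysleep" and "_memprereq"); when a stored value does not parse as a number, the supplied initial value is used and written back to the configuration. A compact sketch of that parse-or-default-and-persist step over a plain map (the thread name is made up):

import java.util.HashMap;
import java.util.Map;

public final class TunedSettings {
    /** Reads config[key] as a long; on a missing or malformed value stores and returns the default. */
    static long getOrInit(final Map<String, String> config, final String key, final long dflt) {
        try {
            return Long.parseLong(config.get(key)); // parseLong(null) also throws NumberFormatException
        } catch (final NumberFormatException e) {
            config.put(key, Long.toString(dflt));   // persist the default for the next start
            return dflt;
        }
    }

    public static void main(final String[] args) {
        final Map<String, String> config = new HashMap<String, String>();
        final String threadName = "10_httpd";       // hypothetical thread name
        final long idle = getOrInit(config, threadName + "_idlesleep", 100L);
        final long busy = getOrInit(config, threadName + "_busysleep", 1000L);
        System.out.println("idle=" + idle + " busy=" + busy + " stored=" + config);
    }
}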
public BusyThread getThread(final String threadName) {
return workerThreads.get(threadName);
return this.workerThreads.get(threadName);
}
public void setThreadPerformance(final String threadName, final long idleMillis, final long busyMillis, final long memprereqBytes) {
final BusyThread thread = workerThreads.get(threadName);
if (thread != null) {
public void setThreadPerformance(
final String threadName,
final long idleMillis,
final long busyMillis,
final long memprereqBytes) {
final BusyThread thread = this.workerThreads.get(threadName);
if ( thread != null ) {
setConfig(threadName + "_idlesleep", thread.setIdleSleep(idleMillis));
setConfig(threadName + "_busysleep", thread.setBusySleep(busyMillis));
setConfig(threadName + "_memprereq", memprereqBytes);
thread.setMemPreReqisite(memprereqBytes);
}
}
public synchronized void terminateThread(final String threadName, final boolean waitFor) {
if (workerThreads.containsKey(threadName)) {
((WorkflowThread) workerThreads.get(threadName)).terminate(waitFor);
workerThreads.remove(threadName);
if ( this.workerThreads.containsKey(threadName) ) {
((WorkflowThread) this.workerThreads.get(threadName)).terminate(waitFor);
this.workerThreads.remove(threadName);
}
}
public void intermissionAllThreads(final long pause) {
final Iterator<String> e = workerThreads.keySet().iterator();
while (e.hasNext()) {
workerThreads.get(e.next()).intermission(pause);
final Iterator<String> e = this.workerThreads.keySet().iterator();
while ( e.hasNext() ) {
this.workerThreads.get(e.next()).intermission(pause);
}
}
public synchronized void terminateAllThreads(final boolean waitFor) {
Iterator<String> e = workerThreads.keySet().iterator();
while (e.hasNext()) {
((WorkflowThread) workerThreads.get(e.next())).terminate(false);
}
if (waitFor) {
e = workerThreads.keySet().iterator();
while (e.hasNext()) {
((WorkflowThread) workerThreads.get(e.next())).terminate(true);
Iterator<String> e = this.workerThreads.keySet().iterator();
while ( e.hasNext() ) {
((WorkflowThread) this.workerThreads.get(e.next())).terminate(false);
}
if ( waitFor ) {
e = this.workerThreads.keySet().iterator();
while ( e.hasNext() ) {
((WorkflowThread) this.workerThreads.get(e.next())).terminate(true);
e.remove();
}
}
}
public String[] sessionsOlderThan(String threadName, long timeout) {
final List<String> list = new ArrayList<String>();
final WorkflowThread st = getThread(threadName);
for (final Session s: ((serverCore) st).getJobList()) {
if (!s.isAlive()) continue;
if (s.getTime() > timeout) {
for ( final Session s : ((serverCore) st).getJobList() ) {
if ( !s.isAlive() ) {
continue;
}
if ( s.getTime() > timeout ) {
list.add(s.getName());
}
}
return (String[]) list.toArray();
}
public void closeSessions(String threadName, String sessionName) {
if (sessionName == null) return;
if ( sessionName == null ) {
return;
}
final WorkflowThread st = getThread(threadName);
for (final Session s: ((serverCore) st).getJobList()) {
if (
(s.isAlive()) &&
(s.getName().equals(sessionName))
) {
for ( final Session s : ((serverCore) st).getJobList() ) {
if ( (s.isAlive()) && (s.getName().equals(sessionName)) ) {
// try to stop session
s.setStopped(true);
try { Thread.sleep(100); } catch (final InterruptedException ex) {}
try {
Thread.sleep(100);
} catch ( final InterruptedException ex ) {
}
// try to interrupt session
s.interrupt();
try { Thread.sleep(100); } catch (final InterruptedException ex) {}
try {
Thread.sleep(100);
} catch ( final InterruptedException ex ) {
}
// try to close socket
if (s.isAlive()) {
if ( s.isAlive() ) {
s.close();
}
// wait for session to finish
if (s.isAlive()) {
try { s.join(500); } catch (final InterruptedException ex) {}
if ( s.isAlive() ) {
try {
s.join(500);
} catch ( final InterruptedException ex ) {
}
}
}
}
}
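
closeSessions() escalates in stages: set a stop flag, sleep briefly, interrupt the session, sleep again, close its socket and finally join with a timeout. A sketch of the same escalation against a plain worker thread; the socket-closing step is omitted because it is specific to serverCore.Session:

public final class StopEscalation {
    static final class Worker extends Thread {
        volatile boolean stopped = false;
        @Override public void run() {
            while (!this.stopped && !isInterrupted()) {
                try { Thread.sleep(50); } catch (final InterruptedException e) { return; }
            }
        }
    }

    static void shutdown(final Worker w) throws InterruptedException {
        w.stopped = true;                  // 1. cooperative stop flag
        Thread.sleep(100);
        if (w.isAlive()) w.interrupt();    // 2. interrupt blocking calls
        Thread.sleep(100);
        if (w.isAlive()) w.join(500);      // 3. bounded wait for termination
    }

    public static void main(final String[] args) throws InterruptedException {
        final Worker w = new Worker();
        w.start();
        shutdown(w);
        System.out.println("worker alive: " + w.isAlive());
    }
}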
public Iterator<String> /*of serverThread-Names (String)*/ threadNames() {
return workerThreads.keySet().iterator();
public Iterator<String> /*of serverThread-Names (String)*/threadNames() {
return this.workerThreads.keySet().iterator();
}
// authentication routines:
public void setAuthentify(final InetAddress host, final String user, final String rights) {
// sets access attributes according to host addresses
authorization.put(host, user + "@" + rights);
this.authorization.put(host, user + "@" + rights);
}
public void removeAuthentify(final InetAddress host) {
// remove access attributes according to host addresses
authorization.remove(host);
this.authorization.remove(host);
}
public String getAuthentifyUser(final InetAddress host) {
// read user name according to host addresses
final String a = authorization.get(host);
if (a == null) return null;
final int p = a.indexOf('@');
if (p < 0) return null;
return a.substring(0, p);
// read user name according to host addresses
final String a = this.authorization.get(host);
if ( a == null ) {
return null;
}
final int p = a.indexOf('@');
if ( p < 0 ) {
return null;
}
return a.substring(0, p);
}
public String getAuthentifyRights(final InetAddress host) {
// read access rigths according to host addresses
final String a = authorization.get(host);
if (a == null) return null;
final int p = a.indexOf('@');
if (p < 0) return null;
return a.substring(p + 1);
// read access rigths according to host addresses
final String a = this.authorization.get(host);
if ( a == null ) {
return null;
}
final int p = a.indexOf('@');
if ( p < 0 ) {
return null;
}
return a.substring(p + 1);
}
public void addAuthentifyRight(final InetAddress host, final String right) {
final String rights = getAuthentifyRights(host);
if (rights == null) {
// create new authentication
setAuthentify(host, "unknown", right);
} else {
// add more authentication
final String user = getAuthentifyUser(host);
setAuthentify(host, user, rights + right);
}
final String rights = getAuthentifyRights(host);
if ( rights == null ) {
// create new authentication
setAuthentify(host, "unknown", right);
} else {
// add more authentication
final String user = getAuthentifyUser(host);
setAuthentify(host, user, rights + right);
}
}
public boolean hasAuthentifyRight(final InetAddress host, final String right) {
final String rights = getAuthentifyRights(host);
if (rights == null) return false;
return rights.indexOf(right) >= 0;
final String rights = getAuthentifyRights(host);
if ( rights == null ) {
return false;
}
return rights.indexOf(right) >= 0;
}
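
The authorization map stores one string per host in the form user + "@" + rights, and getAuthentifyUser()/getAuthentifyRights() split it again at the first '@'. A tiny self-contained illustration of that encoding (the user name and rights letters are invented):

public final class HostAuth {
    static String user(final String entry) {
        if (entry == null) return null;
        final int p = entry.indexOf('@');
        return p < 0 ? null : entry.substring(0, p);
    }

    static String rights(final String entry) {
        if (entry == null) return null;
        final int p = entry.indexOf('@');
        return p < 0 ? null : entry.substring(p + 1);
    }

    public static void main(final String[] args) {
        final String entry = "admin" + "@" + "arw";   // hypothetical user and rights letters
        System.out.println(user(entry) + " has rights " + rights(entry));
        System.out.println("may write: " + (rights(entry).indexOf('w') >= 0));
    }
}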
public File getDataPath() {
return this.dataPath;
return this.dataPath;
}
public File getAppPath() {
return this.appPath;
return this.appPath;
}
@Override
public String toString() {
return configProps.toString();
return this.configProps.toString();
}
public void handleBusyState(final int jobs) {
serverJobs = jobs;
this.serverJobs = jobs;
}
public void track(final String host, final String accessPath) {
this.accessTracker.track(host, accessPath);
}
public Collection<Track> accessTrack(final String host) {
return this.accessTracker.accessTrack(host);
}
}
public int latestAccessCount(final String host, final long timedelta) {
return this.accessTracker.latestAccessCount(host, timedelta);
}
}
public Iterator<String> accessHosts() {
return this.accessTracker.accessHosts();
}
/**
* Retrieve text data (e. g. config file) from file
* Retrieve text data (e. g. config file) from file file may be an url or a filename with path relative to
* rootPath parameter
*
* file may be an url or a filename with path relative to rootPath parameter
* @param file url or filename
* @param rootPath searchpath for file
* @param file file to use when remote fetching fails (null if unused)
*/
public Reader getConfigFileFromWebOrLocally(final String uri,
final String rootPath, final File file) throws IOException, FileNotFoundException {
if (uri.startsWith("http://") || uri.startsWith("https://")) {
public Reader getConfigFileFromWebOrLocally(final String uri, final String rootPath, final File file)
throws IOException,
FileNotFoundException {
if ( uri.startsWith("http://") || uri.startsWith("https://") ) {
final String[] uris = uri.split(",");
for (String netdef: uris) {
for ( String netdef : uris ) {
netdef = netdef.trim();
try {
final RequestHeader reqHeader = new RequestHeader();
@@ -574,52 +620,57 @@ public class serverSwitch {
final HTTPClient client = new HTTPClient();
client.setHeader(reqHeader.entrySet());
byte[] data = client.GETbytes(uri);
if (data == null || data.length == 0) continue;
if ( data == null || data.length == 0 ) {
continue;
}
// save locally in case next fetch fails
if (file != null) {
FileOutputStream f = new FileOutputStream(file);
f.write(data);
f.close();
if ( file != null ) {
FileOutputStream f = new FileOutputStream(file);
f.write(data);
f.close();
}
return new InputStreamReader(new BufferedInputStream(new ByteArrayInputStream(data)));
} catch (final Exception e) {
} catch ( final Exception e ) {
continue;
}
}
if (file != null && file.exists()) {
return new FileReader(file);
if ( file != null && file.exists() ) {
return new FileReader(file);
} else {
throw new FileNotFoundException();
throw new FileNotFoundException();
}
} else {
final File f = (uri.length() > 0 && uri.startsWith("/")) ? new File(uri) : new File(rootPath, uri);
if (f.exists()) {
return new FileReader(f);
} else {
final File f =
(uri.length() > 0 && uri.startsWith("/")) ? new File(uri) : new File(rootPath, uri);
if ( f.exists() ) {
return new FileReader(f);
} else {
throw new FileNotFoundException(f.toString());
throw new FileNotFoundException(f.toString());
}
}
}
}
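
getConfigFileFromWebOrLocally() accepts either a comma-separated list of http/https URLs or a local path; each URL is tried in turn, a successful download is cached in the given file, and the cached copy is used when every fetch fails. A reduced sketch of that fetch-and-fall-back flow, using java.net.URL instead of YaCy's HTTPClient:

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;

public final class RemoteOrLocalConfig {
    static Reader open(final String uriList, final File fallback) throws IOException {
        for (String u : uriList.split(",")) {
            u = u.trim();
            try {
                final InputStream in = new URL(u).openStream();
                final ByteArrayOutputStream buf = new ByteArrayOutputStream();
                final byte[] chunk = new byte[4096];
                int n;
                while ((n = in.read(chunk)) != -1) buf.write(chunk, 0, n);
                in.close();
                if (fallback != null) {                   // cache the download for the next failed fetch
                    final FileOutputStream f = new FileOutputStream(fallback);
                    f.write(buf.toByteArray());
                    f.close();
                }
                return new StringReader(buf.toString("UTF-8"));
            } catch (final IOException e) {
                // try the next URL in the list
            }
        }
        if (fallback != null && fallback.exists()) return new FileReader(fallback);
        throw new FileNotFoundException("no URL reachable and no local copy of the configuration");
    }
}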
private static Random pwGenerator = new Random();
/**
* Generates a random password.
*
* @return random password which is 20 characters long.
*/
public String genRandomPassword() {
return genRandomPassword(20);
return genRandomPassword(20);
}
/**
* Generates a random password of a given length.
*
* @param length length o password
* @return password of given length
*/
public String genRandomPassword(final int length) {
byte[] bytes = new byte[length];
pwGenerator.nextBytes(bytes);
return Digest.encodeMD5Hex(bytes);
byte[] bytes = new byte[length];
pwGenerator.nextBytes(bytes);
return Digest.encodeMD5Hex(bytes);
}
}
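
genRandomPassword() fills 'length' random bytes and returns their MD5 digest rendered as hex, so (assuming Digest.encodeMD5Hex does what its name suggests) the result is always 32 hex characters and the length parameter controls only the random input, not the output length. A standalone sketch with java.security.MessageDigest:

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;

public final class RandomPassword {
    private static final Random pwGenerator = new Random();

    /** Random bytes hashed to an MD5 hex string, mirroring the approach above. */
    static String genRandomPassword(final int length) throws NoSuchAlgorithmException {
        final byte[] bytes = new byte[length];
        pwGenerator.nextBytes(bytes);
        final byte[] md5 = MessageDigest.getInstance("MD5").digest(bytes);
        final StringBuilder hex = new StringBuilder(md5.length * 2);
        for (final byte b : md5) hex.append(String.format("%02x", b));
        return hex.toString();
    }

    public static void main(final String[] args) throws NoSuchAlgorithmException {
        System.out.println(genRandomPassword(20)); // always 32 hex characters, regardless of input length
    }
}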

File diff suppressed because it is too large

@@ -28,7 +28,6 @@
package net.yacy.peers.graphics;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
@@ -64,15 +63,15 @@ import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.LookAheadIterator;
public class WebStructureGraph {
public class WebStructureGraph
{
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
public static int maxhosts = 50000; // maximum number of hosts in web structure map
private final static Log log = new Log("WebStructureGraph");
private final File structureFile;
private final File structureFile;
private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, String> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
@@ -80,9 +79,11 @@ public class WebStructureGraph {
private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null);
private static class leanrefObject {
private static class leanrefObject
{
private final DigestURI url;
private final Set<MultiProtocolURI> globalRefURLs;
private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
this.url = url;
this.globalRefURLs = globalRefURLs;
@@ -98,73 +99,92 @@ public class WebStructureGraph {
// load web structure
Map<String, String> loadedStructure;
try {
loadedStructure = (this.structureFile.exists()) ? FileUtils.loadMap(this.structureFile) : new TreeMap<String, String>();
} catch (final OutOfMemoryError e) {
loadedStructure =
(this.structureFile.exists())
? FileUtils.loadMap(this.structureFile)
: new TreeMap<String, String>();
} catch ( final OutOfMemoryError e ) {
loadedStructure = new TreeMap<String, String>();
}
if (loadedStructure != null) this.structure_old.putAll(loadedStructure);
if ( loadedStructure != null ) {
this.structure_old.putAll(loadedStructure);
}
// delete out-dated entries in case the structure is too big
if (this.structure_old.size() > maxhosts) {
// fill a set with last-modified - dates of the structure
final TreeSet<String> delset = new TreeSet<String>();
String key, value;
for (final Map.Entry<String, String> entry : this.structure_old.entrySet()) {
key = entry.getKey();
value = entry.getValue();
if (value.length() >= 8) delset.add(value.substring(0, 8) + key);
}
int delcount = this.structure_old.size() - (maxhosts * 9 / 10);
final Iterator<String> j = delset.iterator();
while ((delcount > 0) && (j.hasNext())) {
this.structure_old.remove(j.next().substring(8));
delcount--;
}
if ( this.structure_old.size() > maxhosts ) {
// fill a set with last-modified - dates of the structure
final TreeSet<String> delset = new TreeSet<String>();
String key, value;
for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) {
key = entry.getKey();
value = entry.getValue();
if ( value.length() >= 8 ) {
delset.add(value.substring(0, 8) + key);
}
}
int delcount = this.structure_old.size() - (maxhosts * 9 / 10);
final Iterator<String> j = delset.iterator();
while ( (delcount > 0) && (j.hasNext()) ) {
this.structure_old.remove(j.next().substring(8));
delcount--;
}
}
this.publicRefDNSResolvingWorker = new PublicRefDNSResolvingProcess();
this.publicRefDNSResolvingWorker.start();
}
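
The constructor above caps the web structure at maxhosts hosts: it collects every entry into a TreeSet keyed by the 8-character date that prefixes each value plus the host key, then removes the oldest entries until only nine tenths of maxhosts remain. A standalone sketch of that prune-by-date-prefix step:

import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

public final class PruneOldest {
    /** Removes the entries with the oldest yyyymmdd value prefix until the map holds at most 'target' entries. */
    static void prune(final TreeMap<String, String> structure, final int target) {
        final TreeSet<String> byDate = new TreeSet<String>();
        for (final Map.Entry<String, String> e : structure.entrySet()) {
            if (e.getValue().length() >= 8) byDate.add(e.getValue().substring(0, 8) + e.getKey());
        }
        int delcount = structure.size() - target;
        final Iterator<String> i = byDate.iterator();
        while (delcount > 0 && i.hasNext()) {
            structure.remove(i.next().substring(8)); // strip the date prefix to recover the key
            delcount--;
        }
    }

    public static void main(final String[] args) {
        final TreeMap<String, String> structure = new TreeMap<String, String>();
        structure.put("hostA", "20110101...");
        structure.put("hostB", "20120101...");
        prune(structure, 1);
        System.out.println(structure.keySet()); // keeps the newer entry, hostB
    }
}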
private class PublicRefDNSResolvingProcess extends Thread {
private class PublicRefDNSResolvingProcess extends Thread
{
private PublicRefDNSResolvingProcess() {
}
@Override
public void run() {
leanrefObject lro;
try {
while ((lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON) {
while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) {
learnrefs(lro);
}
} catch (final InterruptedException e) {
} catch ( final InterruptedException e ) {
}
}
}
public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser) {
public void generateCitationReference(
final DigestURI url,
final Document document,
final Condenser condenser) {
// generate citation reference
if (url.isLocal()) return; // we do this only for global urls
if ( url.isLocal() ) {
return; // we do this only for global urls
}
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>();
final String refhost = url.getHost();
MultiProtocolURI u;
int maxref = 1000;
while (it.hasNext() && maxref-- > 0) {
while ( it.hasNext() && maxref-- > 0 ) {
u = it.next();
if (u == null) continue;
if (refhost != null && u.getHost() != null && !u.getHost().equals(refhost)) {
if ( u == null ) {
continue;
}
if ( refhost != null && u.getHost() != null && !u.getHost().equals(refhost) ) {
// this is a global link
globalRefURLs.add(u);
}
}
final leanrefObject lro = new leanrefObject(url, globalRefURLs);
if (globalRefURLs.size() > 0) try {
if (this.publicRefDNSResolvingWorker.isAlive()) {
this.publicRefDNSResolvingQueue.put(lro);
} else {
if ( globalRefURLs.size() > 0 ) {
try {
if ( this.publicRefDNSResolvingWorker.isAlive() ) {
this.publicRefDNSResolvingQueue.put(lro);
} else {
learnrefs(lro);
}
} catch ( final InterruptedException e ) {
learnrefs(lro);
}
} catch (final InterruptedException e) {
learnrefs(lro);
}
}
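
generateCitationReference() collects the cross-host links of a document and hands them to the DNS-resolving worker over a BlockingQueue; the worker loops on take() until a dedicated poison object arrives, and if the worker has already died the caller does the work inline. A condensed sketch of that hand-off pattern with plain String jobs:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public final class QueueHandOff {
    private static final String POISON = new String("POISON");  // identity-compared sentinel
    private final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();
    private final Thread worker = new Thread() {
        @Override public void run() {
            try {
                String job;
                while ((job = QueueHandOff.this.queue.take()) != POISON) process(job);
            } catch (final InterruptedException e) {
                // shutdown
            }
        }
    };

    static void process(final String job) {
        System.out.println("processed " + job);
    }

    void submit(final String job) throws InterruptedException {
        if (this.worker.isAlive()) {
            this.queue.put(job);  // normal case: asynchronous processing
        } else {
            process(job);         // worker gone: do the work inline
        }
    }

    public static void main(final String[] args) throws InterruptedException {
        final QueueHandOff h = new QueueHandOff();
        h.worker.start();
        h.submit("job-1");
        h.queue.put(POISON);      // ask the worker to terminate
        h.worker.join();
    }
}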
@@ -173,16 +193,22 @@ public class WebStructureGraph {
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
//final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part
String nexturlhash;
for (final MultiProtocolURI u: lro.globalRefURLs) {
for ( final MultiProtocolURI u : lro.globalRefURLs ) {
final byte[] nexturlhashb = new DigestURI(u).hash();
assert nexturlhashb != null;
if (nexturlhashb != null) {
if ( nexturlhashb != null ) {
nexturlhash = ASCII.String(nexturlhashb);
assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash;
assert nexturlhash.length() == 12 : "nexturlhash.length() = "
+ nexturlhash.length()
+ ", nexturlhash = "
+ nexturlhash;
//assert !nexturlhash.substring(6).equals(refhashp);
// this is a global link
cpg.append(nexturlhash); // store complete hash
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
assert cpg.length() % 12 == 0 : "cpg.length() = "
+ cpg.length()
+ ", cpg = "
+ cpg.toString();
}
}
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
@@ -190,22 +216,26 @@ public class WebStructureGraph {
}
private static int refstr2count(final String refs) {
if ((refs == null) || (refs.length() <= 8)) return 0;
if ( (refs == null) || (refs.length() <= 8) ) {
return 0;
}
assert (refs.length() - 8) % 10 == 0 : "refs = " + refs + ", length = " + refs.length();
return (refs.length() - 8) / 10;
}
static Map<String, Integer> refstr2map(final String refs) {
if ((refs == null) || (refs.length() <= 8)) return new HashMap<String, Integer>();
if ( (refs == null) || (refs.length() <= 8) ) {
return new HashMap<String, Integer>();
}
final Map<String, Integer> map = new HashMap<String, Integer>();
String c;
final int refsc = refstr2count(refs);
int d;
for (int i = 0; i < refsc; i++) {
for ( int i = 0; i < refsc; i++ ) {
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
try {
d = Integer.valueOf(c.substring(6), 16);
} catch (final NumberFormatException e) {
} catch ( final NumberFormatException e ) {
d = 1;
}
map.put(c.substring(0, 6), d);
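
refstr2count() and refstr2map() decode the per-host reference string: an 8-character yyyymmdd date followed by 10-character records, each a 6-character target host hash plus a 4-digit hexadecimal citation count (the next hunk shows the matching encoder appending the date and the key plus hex-count records). A small worked example that decodes one such string:

import java.util.HashMap;
import java.util.Map;

public final class RefStringDemo {
    /** Decodes "yyyymmdd" + repeated ("hhhhhh" + "cccc") records into hosthash -> count. */
    static Map<String, Integer> decode(final String refs) {
        final Map<String, Integer> map = new HashMap<String, Integer>();
        if (refs == null || refs.length() <= 8) return map;
        for (int i = 8; i + 10 <= refs.length(); i += 10) {
            final String hosthash = refs.substring(i, i + 6);
            int count;
            try {
                count = Integer.valueOf(refs.substring(i + 6, i + 10), 16);
            } catch (final NumberFormatException e) {
                count = 1;  // same fallback as the code above
            }
            map.put(hosthash, count);
        }
        return map;
    }

    public static void main(final String[] args) {
        // date 2012-01-15, host "AAAAAA" cited 0x0003 = 3 times, host "BBBBBB" cited 0x001a = 26 times
        System.out.println(decode("20120115" + "AAAAAA0003" + "BBBBBB001a"));
    }
}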
@@ -217,19 +247,19 @@ public class WebStructureGraph {
final StringBuilder s = new StringBuilder(map.size() * 10);
s.append(GenericFormatter.SHORT_DAY_FORMATTER.format());
String h;
for (final Map.Entry<String, Integer> entry : map.entrySet()) {
for ( final Map.Entry<String, Integer> entry : map.entrySet() ) {
s.append(entry.getKey());
h = Integer.toHexString(entry.getValue().intValue());
final int hl = h.length();
if (hl == 0) {
if ( hl == 0 ) {
s.append("0000");
} else if (hl == 1) {
} else if ( hl == 1 ) {
s.append("000").append(h);
} else if (hl == 2) {
} else if ( hl == 2 ) {
s.append("00").append(h);
} else if (hl == 3) {
} else if ( hl == 3 ) {
s.append('0').append(h);
} else if (hl == 4) {
} else if ( hl == 4 ) {
s.append(h);
} else {
s.append("FFFF");
@@ -246,11 +276,11 @@ public class WebStructureGraph {
String hostname = "";
String date = "";
String ref;
synchronized (this.structure_old) {
synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) {
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) {
if ( key.startsWith(hosthash) ) {
hostname = key.substring(7);
ref = tailMap.get(key);
date = ref.substring(0, 8);
@ -258,68 +288,87 @@ public class WebStructureGraph {
}
}
}
synchronized (this.structure_new) {
synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) {
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) {
if ( key.startsWith(hosthash) ) {
ref = tailMap.get(key);
if (hostname.length() == 0) hostname = key.substring(7);
if (date.length() == 0) date = ref.substring(0, 8);
if ( hostname.length() == 0 ) {
hostname = key.substring(7);
}
if ( date.length() == 0 ) {
date = ref.substring(0, 8);
}
h.putAll(refstr2map(ref));
}
}
}
if (h.isEmpty()) return null;
if ( h.isEmpty() ) {
return null;
}
return new StructureEntry(hosthash, hostname, date, h);
}
public StructureEntry incomingReferences(final String hosthash) {
final String hostname = hostHash2hostName(hosthash);
if (hostname == null) return null;
if ( hostname == null ) {
return null;
}
// collect the references
WebStructureGraph.StructureEntry sentry;
final HashMap<String, Integer> hosthashes = new HashMap<String, Integer>();
Iterator<WebStructureGraph.StructureEntry> i = new StructureIterator(false);
while (i.hasNext()) {
while ( i.hasNext() ) {
sentry = i.next();
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
if ( sentry.references.containsKey(hosthash) ) {
hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
}
}
i = new StructureIterator(true);
while (i.hasNext()) {
while ( i.hasNext() ) {
sentry = i.next();
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
if ( sentry.references.containsKey(hosthash) ) {
hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
}
}
// construct a new structureEntry Object
return new StructureEntry(
hosthash,
hostname,
GenericFormatter.SHORT_DAY_FORMATTER.format(),
hosthashes);
hosthash,
hostname,
GenericFormatter.SHORT_DAY_FORMATTER.format(),
hosthashes);
}
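For context, incomingReferences(hosthash) resolves the host name for the given hash, scans both the old and the new structure map and returns a StructureEntry whose references map holds, per referring host hash, the number of links pointing at the queried host. A minimal usage sketch, assuming an already initialized WebStructureGraph instance and an invented 6-character host hash; the import of WebStructureGraph from its YaCy package is assumed:

import java.util.Map;
// (import of WebStructureGraph from its YaCy package assumed)

public final class IncomingRefsDemo {
    // Print every host that links to the host identified by the given 6-character hash.
    public static void printIncoming(final WebStructureGraph graph, final String hosthash) {
        final WebStructureGraph.StructureEntry incoming = graph.incomingReferences(hosthash);
        if (incoming == null) {
            System.out.println("host hash " + hosthash + " is unknown");
            return;
        }
        System.out.println("host: " + incoming.hostname + " (" + incoming.hosthash + ")");
        for (final Map.Entry<String, Integer> ref : incoming.references.entrySet()) {
            // key: hash of a referring host, value: number of links seen from that host
            System.out.println("  <- " + ref.getKey() + " (" + ref.getValue() + " links)");
        }
    }
}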
public static class HostReferenceFactory implements ReferenceFactory<HostReference> {
public static class HostReferenceFactory implements ReferenceFactory<HostReference>
{
private static final Row hostReferenceRow = new Row("String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}", Base64Order.enhancedCoder);
private static final Row hostReferenceRow = new Row(
"String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}",
Base64Order.enhancedCoder);
public HostReferenceFactory() {
}
@Override
public Row getRow() {
return hostReferenceRow;
}
@Override
public HostReference produceSlow(final Entry e) {
return new HostReference(e);
}
@Override
public HostReference produceFast(final HostReference e) {
return e;
}
}
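A hedged reading of the row descriptor above, based on the accessors shown further down: "String h-6" is the 6-byte primary key (the referring host hash, see urlhash()), "Cardinal m-4 {b256}" a base-256 cardinal carrying the modification date in MicroDate days (see lastModified()), and "Cardinal c-4 {b256}" the link count (see count()). A small illustrative fragment, assuming it runs inside WebStructureGraph so the three-argument constructor used in incomingReferencesEnrich is accessible; the hash and count are invented:

// Illustrative fragment: one HostReference row per referring host.
final HostReference hr = new HostReference(
    ASCII.getBytes("AAAAAA"),       // invented 6-character hash of the referring host
    System.currentTimeMillis(),     // last-modified time in milliseconds
    7);                             // number of links observed from that host
System.out.println(ASCII.String(hr.urlhash()) + " / " + hr.lastModified() + " / " + hr.count());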
public static class HostReference extends AbstractReference implements Reference {
public static class HostReference extends AbstractReference implements Reference
{
private final Row.Entry entry;
@ -339,14 +388,17 @@ public class WebStructureGraph {
this.entry = entry;
}
@Override
public String toPropertyForm() {
return this.entry.toPropertyForm(':', true, true, false, true);
}
@Override
public Entry toKelondroEntry() {
return this.entry;
}
@Override
public byte[] urlhash() {
return this.entry.getPrimaryKeyBytes();
}
@ -355,40 +407,50 @@ public class WebStructureGraph {
return (int) this.entry.getColLong(2);
}
@Override
public long lastModified() {
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(1));
}
@Override
public void join(final Reference r) {
// joins two entries into one entry
final HostReference oe = (HostReference) r;
// combine date
final long o = oe.lastModified();
if (lastModified() < o) this.entry.setCol(1, MicroDate.microDateDays(o));
if ( lastModified() < o ) {
this.entry.setCol(1, MicroDate.microDateDays(o));
}
// combine count
final int c = oe.count();
if (count() < c) this.entry.setCol(2, c);
if ( count() < c ) {
this.entry.setCol(2, c);
}
}
@Override
public Collection<Integer> positions() {
return new ArrayList<Integer>(0);
}
}
public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory();
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
public static long hostReferenceIndexCacheTime = 0;
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
public static long hostReferenceIndexCacheTime = 0;
public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
public synchronized ReferenceContainerCache<HostReference> incomingReferences() {
// we return a cache if the cache is filled and not stale
if (hostReferenceIndexCache != null &&
hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis()) return hostReferenceIndexCache;
if ( hostReferenceIndexCache != null
&& hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis() ) {
return hostReferenceIndexCache;
}
// collect the references
final ReferenceContainerCache<HostReference> idx = new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6);
final ReferenceContainerCache<HostReference> idx =
new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6);
// we iterate over all structure entries.
// one structure entry has information that a specific host links to a list of other hosts
@ -403,40 +465,47 @@ public class WebStructureGraph {
}
private void incomingReferencesEnrich(
final ReferenceContainerCache<HostReference> idx,
final Iterator<WebStructureGraph.StructureEntry> structureIterator,
final long time) {
final ReferenceContainerCache<HostReference> idx,
final Iterator<WebStructureGraph.StructureEntry> structureIterator,
final long time) {
// we iterate over all structure entries.
// one structure entry has information that a specific host links to a list of other hosts
final long timeout = System.currentTimeMillis() + time;
byte[] term;
HostReference hr;
WebStructureGraph.StructureEntry sentry;
structureLoop: while (structureIterator.hasNext()) {
structureLoop: while ( structureIterator.hasNext() ) {
sentry = structureIterator.next();
// then we loop over all the hosts that are linked from sentry.hosthash
refloop: for (final Map.Entry<String, Integer> refhosthashandcounter: sentry.references.entrySet()) {
refloop: for ( final Map.Entry<String, Integer> refhosthashandcounter : sentry.references
.entrySet() ) {
term = UTF8.getBytes(refhosthashandcounter.getKey());
try {
hr = new HostReference(ASCII.getBytes(sentry.hosthash), GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), refhosthashandcounter.getValue().intValue());
} catch (final ParseException e) {
hr =
new HostReference(
ASCII.getBytes(sentry.hosthash),
GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(),
refhosthashandcounter.getValue().intValue());
} catch ( final ParseException e ) {
continue refloop;
}
// each term refers to an index entry. look if we already have such an entry
ReferenceContainer<HostReference> r = idx.get(term, null);
try {
if (r == null) {
if ( r == null ) {
r = new ReferenceContainer<HostReference>(hostReferenceFactory, term);
r.add(hr);
idx.add(r);
} else {
r.put(hr);
}
} catch (final RowSpaceExceededException e) {
} catch ( final RowSpaceExceededException e ) {
continue refloop;
}
}
if (System.currentTimeMillis() > timeout) break structureLoop;
if ( System.currentTimeMillis() > timeout ) {
break structureLoop;
}
}
}
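In plain terms, incomingReferencesEnrich inverts the stored link structure: each StructureEntry lists the hosts a source host links to, and the loop re-keys that information by referenced host, adding one HostReference per referring host until the time budget runs out. A minimal sketch of the same inversion with plain maps; the class name and sample data are invented:

import java.util.HashMap;
import java.util.Map;

public final class InvertLinksSketch {
    // outgoing: source host hash -> (target host hash -> link count)
    // returns:  target host hash -> (source host hash -> link count)
    static Map<String, Map<String, Integer>> invert(final Map<String, Map<String, Integer>> outgoing) {
        final Map<String, Map<String, Integer>> incoming = new HashMap<String, Map<String, Integer>>();
        for (final Map.Entry<String, Map<String, Integer>> source : outgoing.entrySet()) {
            for (final Map.Entry<String, Integer> target : source.getValue().entrySet()) {
                Map<String, Integer> refs = incoming.get(target.getKey());
                if (refs == null) {
                    refs = new HashMap<String, Integer>();
                    incoming.put(target.getKey(), refs);
                }
                refs.put(source.getKey(), target.getValue());
            }
        }
        return incoming;
    }

    public static void main(final String[] args) {
        final Map<String, Integer> fromA = new HashMap<String, Integer>();
        fromA.put("CCCCCC", 5);
        final Map<String, Integer> fromB = new HashMap<String, Integer>();
        fromB.put("CCCCCC", 2);
        final Map<String, Map<String, Integer>> outgoing = new HashMap<String, Map<String, Integer>>();
        outgoing.put("AAAAAA", fromA);
        outgoing.put("BBBBBB", fromB);
        // CCCCCC is referenced by AAAAAA (5 links) and BBBBBB (2 links)
        System.out.println(invert(outgoing)); // {CCCCCC={AAAAAA=5, BBBBBB=2}} (order may vary)
    }
}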
@ -459,23 +528,25 @@ public class WebStructureGraph {
public int referencesCount(final String hosthash) {
// returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash;
if (hosthash == null || hosthash.length() != 6) return 0;
if ( hosthash == null || hosthash.length() != 6 ) {
return 0;
}
SortedMap<String, String> tailMap;
int c = 0;
synchronized (this.structure_old) {
synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) {
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) {
if ( key.startsWith(hosthash) ) {
c = refstr2count(tailMap.get(key));
}
}
}
synchronized (this.structure_new) {
synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) {
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) {
if ( key.startsWith(hosthash) ) {
c += refstr2count(tailMap.get(key));
}
}
@ -487,20 +558,20 @@ public class WebStructureGraph {
// returns the host as string, null if unknown
assert hosthash.length() == 6;
SortedMap<String, String> tailMap;
synchronized(this.structure_old) {
synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) {
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) {
if ( key.startsWith(hosthash) ) {
return key.substring(7);
}
}
}
synchronized(this.structure_new) {
synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) {
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) {
if ( key.startsWith(hosthash) ) {
return key.substring(7);
}
}
@ -513,53 +584,61 @@ public class WebStructureGraph {
// parse the new reference string and join it with the stored references
final StructureEntry structure = outgoingReferences(hosthash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0 : "reference.length() = " + reference.length() + ", reference = " + reference.toString();
final Map<String, Integer> refs =
(structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0 : "reference.length() = "
+ reference.length()
+ ", reference = "
+ reference.toString();
String dom;
int c;
for (int i = 0; i < reference.length() / 12; i++) {
for ( int i = 0; i < reference.length() / 12; i++ ) {
dom = reference.substring(i * 12 + 6, (i + 1) * 12);
c = 0;
if (refs.containsKey(dom)) {
if ( refs.containsKey(dom) ) {
c = (refs.get(dom)).intValue();
}
refs.put(dom, Integer.valueOf(++c));
}
// check if the maxref is exceeded
if (refs.size() > maxref) {
if ( refs.size() > maxref ) {
int shrink = refs.size() - (maxref * 9 / 10);
delloop: while (shrink > 0) {
delloop: while ( shrink > 0 ) {
// shrink the references: the entry with the smallest number of references is removed
int minrefcount = Integer.MAX_VALUE;
String minrefkey = null;
findloop: for (final Map.Entry<String, Integer> entry : refs.entrySet()) {
if (entry.getValue().intValue() < minrefcount) {
findloop: for ( final Map.Entry<String, Integer> entry : refs.entrySet() ) {
if ( entry.getValue().intValue() < minrefcount ) {
minrefcount = entry.getValue().intValue();
minrefkey = entry.getKey();
}
if (minrefcount == 1) break findloop;
if ( minrefcount == 1 ) {
break findloop;
}
}
// remove the smallest
if (minrefkey == null) break delloop;
if ( minrefkey == null ) {
break delloop;
}
refs.remove(minrefkey);
shrink--;
}
}
// store the map back to the structure
synchronized(this.structure_new) {
synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
}
}
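The maxref guard above keeps the outgoing map bounded: once it exceeds maxref entries it is trimmed to roughly 90% of maxref by repeatedly dropping the target with the smallest link count. A rough standalone sketch of that policy; the class name, cap and data are invented for illustration:

import java.util.HashMap;
import java.util.Map;

public final class ShrinkRefsSketch {
    // Trim 'refs' to about 90% of maxref entries, dropping the smallest counts first.
    static void shrinkToMaxref(final Map<String, Integer> refs, final int maxref) {
        if (refs.size() <= maxref) return;
        int shrink = refs.size() - (maxref * 9 / 10);
        while (shrink-- > 0) {
            String minKey = null;
            int minCount = Integer.MAX_VALUE;
            for (final Map.Entry<String, Integer> e : refs.entrySet()) {
                if (e.getValue() < minCount) {
                    minCount = e.getValue();
                    minKey = e.getKey();
                }
            }
            if (minKey == null) break;
            refs.remove(minKey);
        }
    }

    public static void main(final String[] args) {
        final Map<String, Integer> refs = new HashMap<String, Integer>();
        for (int i = 0; i < 12; i++) refs.put("host" + i, i + 1); // counts 1..12
        shrinkToMaxref(refs, 10);        // 12 > 10, so trim to 9 entries (90% of 10)
        System.out.println(refs.size()); // 9: the three smallest counts were removed
    }
}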
private static void joinStructure(final TreeMap<String, String> into, final TreeMap<String, String> from) {
for (final Map.Entry<String, String> e: from.entrySet()) {
if (into.containsKey(e.getKey())) {
for ( final Map.Entry<String, String> e : from.entrySet() ) {
if ( into.containsKey(e.getKey()) ) {
final Map<String, Integer> s0 = refstr2map(into.get(e.getKey()));
final Map<String, Integer> s1 = refstr2map(e.getValue());
for (final Map.Entry<String, Integer> r: s1.entrySet()) {
if (s0.containsKey(r.getKey())) {
for ( final Map.Entry<String, Integer> r : s1.entrySet() ) {
if ( s0.containsKey(r.getKey()) ) {
s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue());
} else {
s0.put(r.getKey(), r.getValue().intValue());
@ -573,7 +652,7 @@ public class WebStructureGraph {
}
public void joinOldNew() {
synchronized(this.structure_new) {
synchronized ( this.structure_new ) {
joinStructure(this.structure_old, this.structure_new);
this.structure_new.clear();
}
@ -584,10 +663,10 @@ public class WebStructureGraph {
String maxhost = null;
int refsize, maxref = 0;
joinOldNew();
synchronized(this.structure_new) {
for (final Map.Entry<String, String> entry : this.structure_old.entrySet()) {
synchronized ( this.structure_new ) {
for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) {
refsize = entry.getValue().length();
if (refsize > maxref) {
if ( refsize > maxref ) {
maxref = refsize;
maxhost = entry.getKey().substring(7);
}
@ -600,41 +679,59 @@ public class WebStructureGraph {
return new StructureIterator(latest);
}
private class StructureIterator extends LookAheadIterator<StructureEntry> implements Iterator<StructureEntry> {
private class StructureIterator extends LookAheadIterator<StructureEntry> implements
Iterator<StructureEntry>
{
private final Iterator<Map.Entry<String, String>> i;
private StructureIterator(final boolean latest) {
this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old).entrySet().iterator();
this.i =
((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old)
.entrySet()
.iterator();
}
@Override
public StructureEntry next0() {
Map.Entry<String, String> entry = null;
String dom = null, ref = "";
while (this.i.hasNext()) {
while ( this.i.hasNext() ) {
entry = this.i.next();
ref = entry.getValue();
if ((ref.length() - 8) % 10 != 0) continue;
if ( (ref.length() - 8) % 10 != 0 ) {
continue;
}
dom = entry.getKey();
if (dom.length() >= 8) break;
if ( dom.length() >= 8 ) {
break;
}
dom = null;
}
if (entry == null || dom == null) return null;
if ( entry == null || dom == null ) {
return null;
}
assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length();
return new StructureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref));
return new StructureEntry(
dom.substring(0, 6),
dom.substring(7),
ref.substring(0, 8),
refstr2map(ref));
}
}
public static class StructureEntry {
public static class StructureEntry
{
public String hosthash; // the tail of the host hash
public String hostname; // the host name
public String date; // date of latest change
public String date; // date of latest change
public Map<String, Integer> references; // a map from the referenced host hash to the number of references to that host
private StructureEntry(
final String hosthash,
final String hostname,
final String date,
final Map<String, Integer> references) {
final String hosthash,
final String hostname,
final String date,
final Map<String, Integer> references) {
this.hosthash = hosthash;
this.hostname = hostname;
this.date = date;
@ -644,30 +741,42 @@ public class WebStructureGraph {
public void close() {
// finish dns resolving queue
if (this.publicRefDNSResolvingWorker.isAlive()) {
if ( this.publicRefDNSResolvingWorker.isAlive() ) {
log.logInfo("Waiting for the DNS Resolving Queue to terminate");
try {
this.publicRefDNSResolvingQueue.put(leanrefObjectPOISON);
this.publicRefDNSResolvingWorker.join(5000);
} catch (final InterruptedException e) {
} catch ( final InterruptedException e ) {
}
}
// save to web structure file
log.logInfo("Saving Web Structure File: new = " + this.structure_new.size() + " entries, old = " + this.structure_old.size() + " entries");
log.logInfo("Saving Web Structure File: new = "
+ this.structure_new.size()
+ " entries, old = "
+ this.structure_old.size()
+ " entries");
final long time = System.currentTimeMillis();
joinOldNew();
if (this.structure_old.size() > 0) try {
synchronized(this.structure_old) {
if (this.structure_old.size() > 0) {
FileUtils.saveMap(this.structureFile, this.structure_old, "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
if ( this.structure_old.size() > 0 ) {
synchronized ( this.structure_old ) {
if ( this.structure_old.size() > 0 ) {
FileUtils
.saveMap(
this.structureFile,
this.structure_old,
"Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
final long t = Math.max(1, System.currentTimeMillis() - time);
log.logInfo("Saved Web Structure File: " + this.structure_old.size() + " entries in " + t + " milliseconds, " + (this.structure_old.size() * 1000 / t) + " entries/second");
log.logInfo("Saved Web Structure File: "
+ this.structure_old.size()
+ " entries in "
+ t
+ " milliseconds, "
+ (this.structure_old.size() * 1000 / t)
+ " entries/second");
}
this.structure_old.clear();
}
} catch (final IOException e) {
Log.logException(e);
}
}
}