- less automatic indexing after a search (needs to reset the default crawl profiles)
- fix for concurrency problem in storage of serverSwitch Properties
- markup update
pull/1/head
Michael Christen 13 years ago
parent f62e6fb438
commit e7e429705a

@ -45,18 +45,19 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException; import net.yacy.kelondro.util.kelondroException;
import net.yacy.repository.RegexHelper; import net.yacy.repository.RegexHelper;
public final class CrawlSwitchboard { public final class CrawlSwitchboard
{
public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates"; public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap"; public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap"; public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@ -65,30 +66,28 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log; private final Log log;
private MapHeap profilesActiveCrawls; private MapHeap profilesActiveCrawls;
private final MapHeap profilesPassiveCrawls; private final MapHeap profilesPassiveCrawls;
private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder); private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
public CrawlProfile defaultProxyProfile; public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile; public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile; public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot; private final File queuesRoot;
public CrawlSwitchboard( public CrawlSwitchboard(final String networkName, final Log log, final File queuesRoot) {
final String networkName,
final Log log,
final File queuesRoot) {
log.logInfo("Initializing Word Index for the network '" + networkName + "'."); log.logInfo("Initializing Word Index for the network '" + networkName + "'.");
if (networkName == null || networkName.length() == 0) { if ( networkName == null || networkName.length() == 0 ) {
log.logSevere("no network name given - shutting down"); log.logSevere("no network name given - shutting down");
System.exit(0); System.exit(0);
} }
this.log = log; this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder)); this.profilesActiveCrawlsCache =
Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
// make crawl profiles database and default profiles // make crawl profiles database and default profiles
this.queuesRoot = queuesRoot; this.queuesRoot = queuesRoot;
@ -97,84 +96,115 @@ public final class CrawlSwitchboard {
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = loadFromDB(profilesActiveFile); this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
for (final byte[] handle : this.profilesActiveCrawls.keySet()) { for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
CrawlProfile p; CrawlProfile p;
try { try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle)); p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException e) { } catch ( final IOException e ) {
p = null; p = null;
} catch (final RowSpaceExceededException e) { } catch ( final RowSpaceExceededException e ) {
p = null; p = null;
} }
if (p == null) continue; if ( p == null ) {
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) { continue;
}
if ( !RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH)) ) {
removeActive(handle); removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name() Log.logWarning("CrawlProfiles", "removed Profile "
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH + p.handle()
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH)); + ": "
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) { + p.name()
+ " from active crawls since "
+ CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: "
+ p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if ( !RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH)) ) {
removeActive(handle); removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name() Log.logWarning("CrawlProfiles", "removed Profile "
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH + p.handle()
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH)); + ": "
+ p.name()
+ " from active crawls since "
+ CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: "
+ p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
} else { } else {
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name()); Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
} }
} }
initActiveCrawlProfiles(); initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries"); log.logInfo("Loaded active crawl profiles from file "
+ profilesActiveFile.getName()
+ ", "
+ this.profilesActiveCrawls.size()
+ " entries");
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES); final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile); this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
for (final byte[] handle : this.profilesPassiveCrawls.keySet()) { for ( final byte[] handle : this.profilesPassiveCrawls.keySet() ) {
CrawlProfile p; CrawlProfile p;
try { try {
p = new CrawlProfile(this.profilesPassiveCrawls.get(handle)); p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name()); Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
} catch (final IOException e) { } catch ( final IOException e ) {
continue; continue;
} catch (final RowSpaceExceededException e) { } catch ( final RowSpaceExceededException e ) {
continue; continue;
} }
} }
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() + log.logInfo("Loaded passive crawl profiles from file "
", " + this.profilesPassiveCrawls.size() + " entries" + + profilesPassiveFile.getName()
", " + profilesPassiveFile.length()/1024); + ", "
+ this.profilesPassiveCrawls.size()
+ " entries"
+ ", "
+ profilesPassiveFile.length()
/ 1024);
} }
public CrawlProfile getActive(final byte[] profileKey) { public CrawlProfile getActive(final byte[] profileKey) {
if (profileKey == null) return null; if ( profileKey == null ) {
return null;
}
// get from cache // get from cache
CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey); CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey);
if (p != null) return p; if ( p != null ) {
return p;
}
// get from db // get from db
Map<String, String> m; Map<String, String> m;
try { try {
m = this.profilesActiveCrawls.get(profileKey); m = this.profilesActiveCrawls.get(profileKey);
} catch (final IOException e) { } catch ( final IOException e ) {
m = null; m = null;
} catch (final RowSpaceExceededException e) { } catch ( final RowSpaceExceededException e ) {
m = null; m = null;
} }
if (m == null) return null; if ( m == null ) {
return null;
}
p = new CrawlProfile(m); p = new CrawlProfile(m);
this.profilesActiveCrawlsCache.put(profileKey, p); this.profilesActiveCrawlsCache.put(profileKey, p);
return p; return p;
} }
public CrawlProfile getPassive(final byte[] profileKey) { public CrawlProfile getPassive(final byte[] profileKey) {
if (profileKey == null) return null; if ( profileKey == null ) {
return null;
}
Map<String, String> m; Map<String, String> m;
try { try {
m = this.profilesPassiveCrawls.get(profileKey); m = this.profilesPassiveCrawls.get(profileKey);
} catch (final IOException e) { } catch ( final IOException e ) {
m = null; m = null;
} catch (final RowSpaceExceededException e) { } catch ( final RowSpaceExceededException e ) {
m = null; m = null;
} }
if (m == null) return null; if ( m == null ) {
return null;
}
return new CrawlProfile(m); return new CrawlProfile(m);
} }
@ -187,13 +217,17 @@ public final class CrawlSwitchboard {
} }
public void removeActive(final byte[] profileKey) { public void removeActive(final byte[] profileKey) {
if (profileKey == null) return; if ( profileKey == null ) {
return;
}
this.profilesActiveCrawlsCache.remove(profileKey); this.profilesActiveCrawlsCache.remove(profileKey);
this.profilesActiveCrawls.remove(profileKey); this.profilesActiveCrawls.remove(profileKey);
} }
public void removePassive(final byte[] profileKey) { public void removePassive(final byte[] profileKey) {
if (profileKey == null) return; if ( profileKey == null ) {
return;
}
this.profilesPassiveCrawls.remove(profileKey); this.profilesPassiveCrawls.remove(profileKey);
} }
@ -217,18 +251,32 @@ public final class CrawlSwitchboard {
CrawlProfile profile; CrawlProfile profile;
String name; String name;
try { try {
for (final byte[] handle: this.profilesActiveCrawls.keySet()) { for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
profile = new CrawlProfile(this.profilesActiveCrawls.get(handle)); profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
name = profile.name(); name = profile.name();
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile; if ( name.equals(CRAWL_PROFILE_PROXY) ) {
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile; this.defaultProxyProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) this.defaultTextSnippetLocalProfile = profile; }
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile; if ( name.equals(CRAWL_PROFILE_REMOTE) ) {
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile; this.defaultRemoteProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile; }
if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile; if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
this.defaultTextSnippetLocalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
this.defaultTextSnippetGlobalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
this.defaultMediaSnippetLocalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
this.defaultMediaSnippetGlobalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SURROGATE) ) {
this.defaultSurrogateProfile = profile;
}
} }
} catch (final Exception e) { } catch ( final Exception e ) {
this.profilesActiveCrawls.clear(); this.profilesActiveCrawls.clear();
this.defaultProxyProfile = null; this.defaultProxyProfile = null;
this.defaultRemoteProfile = null; this.defaultRemoteProfile = null;
@ -239,69 +287,215 @@ public final class CrawlSwitchboard {
this.defaultSurrogateProfile = null; this.defaultSurrogateProfile = null;
} }
if (this.defaultProxyProfile == null) { if ( this.defaultProxyProfile == null ) {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile( this.defaultProxyProfile =
"proxy", null, new CrawlProfile(
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "proxy",
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"", "",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
true,
true,
true,
CacheStrategy.IFFRESH); CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile); this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
} }
if (this.defaultRemoteProfile == null) { if ( this.defaultRemoteProfile == null ) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true, this.defaultRemoteProfile =
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH); new CrawlProfile(
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); CRAWL_PROFILE_REMOTE,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
CrawlProfile.MATCH_NEVER_STRING,
0,
false,
-1,
-1,
true,
true,
true,
false,
false,
true,
true,
false,
CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
} }
if (this.defaultTextSnippetLocalProfile == null) { if ( this.defaultTextSnippetLocalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, this.defaultTextSnippetLocalProfile =
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); new CrawlProfile(
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
-1,
true,
false,
false,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
} }
if (this.defaultTextSnippetGlobalProfile == null) { if ( this.defaultTextSnippetGlobalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, this.defaultTextSnippetGlobalProfile =
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST); new CrawlProfile(
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
-1,
true,
true,
true,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
} }
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) { if ( this.defaultMediaSnippetLocalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, this.defaultMediaSnippetLocalProfile =
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); new CrawlProfile(
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
false,
false,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
} }
if (this.defaultMediaSnippetGlobalProfile == null) { if ( this.defaultMediaSnippetGlobalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, this.defaultMediaSnippetGlobalProfile =
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST); new CrawlProfile(
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
false,
true,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
} }
if (this.defaultSurrogateProfile == null) { if ( this.defaultSurrogateProfile == null ) {
// generate new default entry for surrogate parsing // generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false, this.defaultSurrogateProfile =
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE); new CrawlProfile(
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); CRAWL_PROFILE_SURROGATE,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
-1,
true,
true,
false,
false,
false,
true,
true,
false,
CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);
} }
} }
private void resetProfiles() { private void resetProfiles() {
this.profilesActiveCrawlsCache.clear(); this.profilesActiveCrawlsCache.clear();
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) FileUtils.deletedelete(pdb); if ( pdb.exists() ) {
FileUtils.deletedelete(pdb);
}
try { try {
this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' '); this.profilesActiveCrawls =
} catch (final IOException e1) { new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e1 ) {
Log.logException(e1); Log.logException(e1);
this.profilesActiveCrawls = null; this.profilesActiveCrawls = null;
} }
@ -313,48 +507,49 @@ public final class CrawlSwitchboard {
CrawlProfile entry; CrawlProfile entry;
boolean hasDoneSomething = false; boolean hasDoneSomething = false;
try { try {
for (final byte[] handle: this.profilesActiveCrawls.keySet()) { for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
// check for interruption // check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress"); if ( Thread.currentThread().isInterrupted() ) {
throw new InterruptedException("Shutdown in progress");
}
// getting next profile // getting next profile
try { try {
entry = new CrawlProfile(this.profilesActiveCrawls.get(handle)); entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException e) { } catch ( final IOException e ) {
continue; continue;
} catch (final RowSpaceExceededException e) { } catch ( final RowSpaceExceededException e ) {
continue; continue;
} }
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) || if ( !((entry.name().equals(CRAWL_PROFILE_PROXY))
(entry.name().equals(CRAWL_PROFILE_REMOTE)) || || (entry.name().equals(CRAWL_PROFILE_REMOTE))
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) || || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) || || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) || || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name()
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) { .equals(CRAWL_PROFILE_SURROGATE))) ) {
final CrawlProfile p = new CrawlProfile(entry); final CrawlProfile p = new CrawlProfile(entry);
this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p); this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p);
this.profilesActiveCrawls.remove(handle); this.profilesActiveCrawls.remove(handle);
hasDoneSomething = true; hasDoneSomething = true;
} }
} }
} catch (final kelondroException e) { } catch ( final kelondroException e ) {
resetProfiles(); resetProfiles();
hasDoneSomething = true; hasDoneSomething = true;
} }
return hasDoneSomething; return hasDoneSomething;
} }
public void close() { public void close() {
this.profilesActiveCrawlsCache.clear(); this.profilesActiveCrawlsCache.clear();
this.profilesActiveCrawls.close(); this.profilesActiveCrawls.close();
this.profilesPassiveCrawls.close(); this.profilesPassiveCrawls.close();
} }
/** /**
* Loads crawl profiles from a DB file. * Loads crawl profiles from a DB file.
*
* @param file DB file * @param file DB file
* @return crawl profile data * @return crawl profile data
*/ */
@ -362,12 +557,14 @@ public final class CrawlSwitchboard {
MapHeap ret; MapHeap ret;
try { try {
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' '); ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch (final IOException e) { } catch ( final IOException e ) {
Log.logException(e);Log.logException(e); Log.logException(e);
Log.logException(e);
FileUtils.deletedelete(file); FileUtils.deletedelete(file);
try { try {
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' '); ret =
} catch (final IOException e1) { new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e1 ) {
Log.logException(e1); Log.logException(e1);
ret = null; ret = null;
} }

@ -39,8 +39,8 @@ import java.util.Map;
import java.util.NavigableMap; import java.util.NavigableMap;
import java.util.Random; import java.util.Random;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
@ -52,27 +52,31 @@ import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.BusyThread; import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.kelondro.workflow.WorkflowThread; import net.yacy.kelondro.workflow.WorkflowThread;
import de.anomic.server.serverAccessTracker.Track; import de.anomic.server.serverAccessTracker.Track;
import de.anomic.server.serverCore.Session; import de.anomic.server.serverCore.Session;
public class serverSwitch { public class serverSwitch
{
// configuration management // configuration management
private final File configFile; private final File configFile;
private final String configComment; private final String configComment;
private final File dataPath; private final File dataPath;
protected final File appPath; protected final File appPath;
protected boolean firstInit; protected boolean firstInit;
protected Log log; protected Log log;
protected int serverJobs; protected int serverJobs;
private ConcurrentMap<String, String> configProps; private ConcurrentMap<String, String> configProps;
private final ConcurrentMap<String, String> configRemoved; private final ConcurrentMap<String, String> configRemoved;
private final ConcurrentMap<InetAddress, String> authorization; private final ConcurrentMap<InetAddress, String> authorization;
private final NavigableMap<String, BusyThread> workerThreads; private final NavigableMap<String, BusyThread> workerThreads;
private final serverAccessTracker accessTracker; private final serverAccessTracker accessTracker;
public serverSwitch(final File dataPath, final File appPath, final String initPath, final String configPath) { public serverSwitch(
final File dataPath,
final File appPath,
final String initPath,
final String configPath) {
// we initialize the switchboard with a property file, // we initialize the switchboard with a property file,
// but maintain these properties then later in a new 'config' file // but maintain these properties then later in a new 'config' file
// to reset all changed configs, the config file must // to reset all changed configs, the config file must
@ -81,47 +85,51 @@ public class serverSwitch {
// file name of the config file // file name of the config file
this.dataPath = dataPath; this.dataPath = dataPath;
this.appPath = appPath; this.appPath = appPath;
this.configComment = "This is an automatically generated file, updated by serverAbstractSwitch and initialized by " + initPath; this.configComment =
"This is an automatically generated file, updated by serverAbstractSwitch and initialized by "
+ initPath;
final File initFile = new File(appPath, initPath); final File initFile = new File(appPath, initPath);
this.configFile = new File(dataPath, configPath); // propertiesFile(config); this.configFile = new File(dataPath, configPath); // propertiesFile(config);
firstInit = !configFile.exists(); // this is true if the application was started for the first time this.firstInit = !this.configFile.exists(); // this is true if the application was started for the first time
new File(configFile.getParent()).mkdir(); new File(this.configFile.getParent()).mkdir();
// predefine init's // predefine init's
final ConcurrentMap<String, String> initProps; final ConcurrentMap<String, String> initProps;
if (initFile.exists()) if ( initFile.exists() ) {
initProps = FileUtils.loadMap(initFile); initProps = FileUtils.loadMap(initFile);
else } else {
initProps = new ConcurrentHashMap<String, String>(); initProps = new ConcurrentHashMap<String, String>();
}
// if 'pro'-version is selected, overload standard settings with 'pro'-settings // if 'pro'-version is selected, overload standard settings with 'pro'-settings
Iterator<String> i; Iterator<String> i;
String prop; String prop;
// delete the 'pro' init settings // delete the 'pro' init settings
i = initProps.keySet().iterator(); i = initProps.keySet().iterator();
while (i.hasNext()) { while ( i.hasNext() ) {
prop = i.next(); prop = i.next();
if (prop.endsWith("__pro")) { if ( prop.endsWith("__pro") ) {
i.remove(); i.remove();
} }
} }
// load config's from last save // load config's from last save
if (configFile.exists()) if ( this.configFile.exists() ) {
configProps = FileUtils.loadMap(configFile); this.configProps = FileUtils.loadMap(this.configFile);
else } else {
configProps = new ConcurrentHashMap<String, String>(); this.configProps = new ConcurrentHashMap<String, String>();
}
// remove all values from config that do not appear in init // remove all values from config that do not appear in init
configRemoved = new ConcurrentHashMap<String, String>(); this.configRemoved = new ConcurrentHashMap<String, String>();
synchronized (configProps) { synchronized ( this.configProps ) {
i = configProps.keySet().iterator(); i = this.configProps.keySet().iterator();
String key; String key;
while (i.hasNext()) { while ( i.hasNext() ) {
key = i.next(); key = i.next();
if (!(initProps.containsKey(key))) { if ( !(initProps.containsKey(key)) ) {
configRemoved.put(key, this.configProps.get(key)); this.configRemoved.put(key, this.configProps.get(key));
i.remove(); i.remove();
} }
} }
@ -132,8 +140,8 @@ public class serverSwitch {
// merge new props from init to config // merge new props from init to config
// this is necessary for migration, when new properties are attached // this is necessary for migration, when new properties are attached
initProps.putAll(configProps); initProps.putAll(this.configProps);
configProps = initProps; this.configProps = initProps;
// save result; this may initially create a config file after // save result; this may initially create a config file after
// initialization // initialization
@ -141,48 +149,50 @@ public class serverSwitch {
} }
// other settings // other settings
authorization = new ConcurrentHashMap<InetAddress, String>(); this.authorization = new ConcurrentHashMap<InetAddress, String>();
// init thread control // init thread control
workerThreads = new TreeMap<String, BusyThread>(); this.workerThreads = new TreeMap<String, BusyThread>();
// init busy state control // init busy state control
serverJobs = 0; this.serverJobs = 0;
// init server tracking // init server tracking
this.accessTracker = new serverAccessTracker( this.accessTracker =
getConfigLong("server.maxTrackingTime", 60 * 60 * 1000), new serverAccessTracker(
(int) getConfigLong("server.maxTrackingCount", 1000), getConfigLong("server.maxTrackingTime", 60 * 60 * 1000),
(int) getConfigLong("server.maxTrackingHostCount", 100) (int) getConfigLong("server.maxTrackingCount", 1000),
); (int) getConfigLong("server.maxTrackingHostCount", 100));
} }
public String myPublicIP() { public String myPublicIP() {
// if a static IP was configured, we have to return it here ... // if a static IP was configured, we have to return it here ...
final String staticIP = getConfig("staticIP", ""); final String staticIP = getConfig("staticIP", "");
if (staticIP.length() > 0) { if ( staticIP.length() > 0 ) {
return staticIP; return staticIP;
} }
// otherwise we return the real IP address of this host // otherwise we return the real IP address of this host
final InetAddress pLIP = Domains.myPublicLocalIP(); final InetAddress pLIP = Domains.myPublicLocalIP();
if (pLIP != null) return pLIP.getHostAddress(); if ( pLIP != null ) {
return pLIP.getHostAddress();
}
return null; return null;
} }
// a logger for this switchboard // a logger for this switchboard
public void setLog(final Log log) { public void setLog(final Log log) {
this.log = log; this.log = log;
} }
public Log getLog() { public Log getLog() {
return log; return this.log;
} }
public void setConfig(final Map<String, String> otherConfigs) { public void setConfig(final Map<String, String> otherConfigs) {
final Iterator<Map.Entry<String, String>> i = otherConfigs.entrySet().iterator(); final Iterator<Map.Entry<String, String>> i = otherConfigs.entrySet().iterator();
Map.Entry<String, String> entry; Map.Entry<String, String> entry;
while (i.hasNext()) { while ( i.hasNext() ) {
entry = i.next(); entry = i.next();
setConfig(entry.getKey(), entry.getValue()); setConfig(entry.getKey(), entry.getValue());
} }
@ -202,94 +212,99 @@ public class serverSwitch {
public void setConfig(final String key, final String value) { public void setConfig(final String key, final String value) {
// set the value // set the value
final String oldValue = configProps.put(key, value); final String oldValue = this.configProps.put(key, value);
if (oldValue == null || !value.equals(oldValue)) saveConfig(); if ( oldValue == null || !value.equals(oldValue) ) {
saveConfig();
}
} }
public void removeConfig(final String key) { public void removeConfig(final String key) {
configProps.remove(key); this.configProps.remove(key);
} }
/** /**
* Gets a configuration parameter from the properties. * Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter * @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be * @param dflt default value which will be used in case parameter can not be found or if it is invalid
* found or if it is invalid
* @return value if the parameter or default value * @return value if the parameter or default value
*/ */
public String getConfig(final String key, final String dflt) { public String getConfig(final String key, final String dflt) {
// get the value // get the value
final String s = configProps.get(key); final String s = this.configProps.get(key);
// return value // return value
if (s == null) return dflt; if ( s == null ) {
return dflt;
}
return s; return s;
} }
/** /**
* Gets a configuration parameter from the properties. * Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter * @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be * @param dflt default value which will be used in case parameter can not be found or if it is invalid
* found or if it is invalid
* @return value if the parameter or default value * @return value if the parameter or default value
*/ */
public long getConfigLong(final String key, final long dflt) { public long getConfigLong(final String key, final long dflt) {
try { try {
return Long.parseLong(getConfig(key, Long.toString(dflt))); return Long.parseLong(getConfig(key, Long.toString(dflt)));
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
return dflt; return dflt;
} }
} }
/** /**
* Gets a configuration parameter from the properties. * Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter * @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be * @param dflt default value which will be used in case parameter can not be found or if it is invalid
* found or if it is invalid
* @return value if the parameter or default value * @return value if the parameter or default value
*/ */
public double getConfigFloat(final String key, final float dflt) { public double getConfigFloat(final String key, final float dflt) {
try { try {
return Float.parseFloat(getConfig(key, Float.toString(dflt))); return Float.parseFloat(getConfig(key, Float.toString(dflt)));
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
return dflt; return dflt;
} }
} }
/** /**
* Gets a configuration parameter from the properties. * Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter * @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be * @param dflt default value which will be used in case parameter can not be found or if it is invalid
* found or if it is invalid
* @return value if the parameter or default value * @return value if the parameter or default value
*/ */
public int getConfigInt(final String key, final int dflt) { public int getConfigInt(final String key, final int dflt) {
try { try {
return Integer.parseInt(getConfig(key, Integer.toString(dflt))); return Integer.parseInt(getConfig(key, Integer.toString(dflt)));
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
return dflt; return dflt;
} }
} }
/** /**
* Gets a configuration parameter from the properties. * Gets a configuration parameter from the properties.
*
* @param key name of the configuration parameter * @param key name of the configuration parameter
* @param dflt default value which will be used in case parameter can not be * @param dflt default value which will be used in case parameter can not be found or if it is invalid
* found or if it is invalid
* @return value if the parameter or default value * @return value if the parameter or default value
*/ */
public boolean getConfigBool(final String key, final boolean dflt) { public boolean getConfigBool(final String key, final boolean dflt) {
return Boolean.parseBoolean(getConfig(key, Boolean.toString(dflt))); return Boolean.parseBoolean(getConfig(key, Boolean.toString(dflt)));
} }
/** /**
* Create a File instance for a configuration setting specifying a path. * Create a File instance for a configuration setting specifying a path.
* @param key config key *
* @param dflt default path value, that is used when there is no value * @param key config key
* <code>key</code> in the configuration. * @param dflt default path value, that is used when there is no value <code>key</code> in the
* @return if the value of the setting is an absolute path String, then the * configuration.
* returned File is derived from this setting only. Otherwise the path's file * @return if the value of the setting is an absolute path String, then the returned File is derived from
* is constructed from the applications root path + the relative path setting. * this setting only. Otherwise the path's file is constructed from the applications root path +
* the relative path setting.
*/ */
public File getDataPath(final String key, final String dflt) { public File getDataPath(final String key, final String dflt) {
File ret; File ret;
@ -298,7 +313,7 @@ public class serverSwitch {
ret = (f.isAbsolute() ? new File(f.getAbsolutePath()) : new File(this.dataPath, path)); ret = (f.isAbsolute() ? new File(f.getAbsolutePath()) : new File(this.dataPath, path));
return ret; return ret;
} }
public File getAppPath(final String key, final String dflt) { public File getAppPath(final String key, final String dflt) {
File ret; File ret;
final String path = getConfig(key, dflt).replace('\\', '/'); final String path = getConfig(key, dflt).replace('\\', '/');
@ -308,265 +323,296 @@ public class serverSwitch {
} }
public Iterator<String> configKeys() { public Iterator<String> configKeys() {
return configProps.keySet().iterator(); return this.configProps.keySet().iterator();
} }
private void saveConfig() { private void saveConfig() {
try { ConcurrentMap<String, String> configPropsCopy = new ConcurrentHashMap<String, String>();
ConcurrentMap<String, String> configPropsCopy = new ConcurrentHashMap<String, String>(); configPropsCopy.putAll(this.configProps); // avoid concurrency problems
configPropsCopy.putAll(configProps); // avoid concurrency problems FileUtils.saveMap(this.configFile, configPropsCopy, this.configComment);
FileUtils.saveMap(configFile, configPropsCopy, configComment);
} catch (final IOException e) {
log.logSevere("CONFIG: Cannot write config file " + configFile.toString() + ": " + e.getMessage(), e);
//System.out.println("ERROR: cannot write config file " + configFile.toString() + ": " + e.getMessage());
}
} }
/** /**
* Gets configuration parameters which have been removed during initialization. * Gets configuration parameters which have been removed during initialization.
*
* @return contains parameter name as key and parameter value as value * @return contains parameter name as key and parameter value as value
*/ */
public ConcurrentMap<String, String> getRemoved() { public ConcurrentMap<String, String> getRemoved() {
return configRemoved; return this.configRemoved;
} }
public void deployThread( public void deployThread(
final String threadName, final String threadName,
final String threadShortDescription, final String threadShortDescription,
final String threadLongDescription, final String threadLongDescription,
final String threadMonitorURL, final String threadMonitorURL,
final BusyThread newThread, final BusyThread newThread,
final long startupDelay) { final long startupDelay) {
deployThread(threadName, threadShortDescription, threadLongDescription, threadMonitorURL, deployThread(
newThread, startupDelay, threadName,
Long.parseLong(getConfig(threadName + "_idlesleep" , "100")), threadShortDescription,
Long.parseLong(getConfig(threadName + "_busysleep" , "1000")), threadLongDescription,
Long.parseLong(getConfig(threadName + "_memprereq" , "1000000"))); threadMonitorURL,
newThread,
startupDelay,
Long.parseLong(getConfig(threadName + "_idlesleep", "100")),
Long.parseLong(getConfig(threadName + "_busysleep", "1000")),
Long.parseLong(getConfig(threadName + "_memprereq", "1000000")));
} }
public void deployThread( public void deployThread(
final String threadName, final String threadName,
final String threadShortDescription, final String threadShortDescription,
final String threadLongDescription, final String threadLongDescription,
final String threadMonitorURL, final String threadMonitorURL,
final BusyThread newThread, final BusyThread newThread,
final long startupDelay, final long startupDelay,
final long initialIdleSleep, final long initialIdleSleep,
final long initialBusySleep, final long initialBusySleep,
final long initialMemoryPreRequisite) { final long initialMemoryPreRequisite) {
if (newThread.isAlive()) throw new RuntimeException("undeployed threads must not live; they are started as part of the deployment"); if ( newThread.isAlive() ) {
throw new RuntimeException(
"undeployed threads must not live; they are started as part of the deployment");
}
newThread.setStartupSleep(startupDelay); newThread.setStartupSleep(startupDelay);
long x; long x;
try { try {
x = Long.parseLong(getConfig(threadName + "_idlesleep" , "novalue")); x = Long.parseLong(getConfig(threadName + "_idlesleep", "novalue"));
newThread.setIdleSleep(x); newThread.setIdleSleep(x);
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
newThread.setIdleSleep(initialIdleSleep); newThread.setIdleSleep(initialIdleSleep);
setConfig(threadName + "_idlesleep", initialIdleSleep); setConfig(threadName + "_idlesleep", initialIdleSleep);
} }
try { try {
x = Long.parseLong(getConfig(threadName + "_busysleep" , "novalue")); x = Long.parseLong(getConfig(threadName + "_busysleep", "novalue"));
newThread.setBusySleep(x); newThread.setBusySleep(x);
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
newThread.setBusySleep(initialBusySleep); newThread.setBusySleep(initialBusySleep);
setConfig(threadName + "_busysleep", initialBusySleep); setConfig(threadName + "_busysleep", initialBusySleep);
} }
try { try {
x = Long.parseLong(getConfig(threadName + "_memprereq" , "novalue")); x = Long.parseLong(getConfig(threadName + "_memprereq", "novalue"));
newThread.setMemPreReqisite(x); newThread.setMemPreReqisite(x);
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
newThread.setMemPreReqisite(initialMemoryPreRequisite); newThread.setMemPreReqisite(initialMemoryPreRequisite);
setConfig(threadName + "_memprereq", initialMemoryPreRequisite); setConfig(threadName + "_memprereq", initialMemoryPreRequisite);
} }
newThread.setDescription(threadShortDescription, threadLongDescription, threadMonitorURL); newThread.setDescription(threadShortDescription, threadLongDescription, threadMonitorURL);
workerThreads.put(threadName, newThread); this.workerThreads.put(threadName, newThread);
// start the thread // start the thread
if (workerThreads.containsKey(threadName)) newThread.start(); if ( this.workerThreads.containsKey(threadName) ) {
newThread.start();
}
} }
public BusyThread getThread(final String threadName) { public BusyThread getThread(final String threadName) {
return workerThreads.get(threadName); return this.workerThreads.get(threadName);
} }
public void setThreadPerformance(final String threadName, final long idleMillis, final long busyMillis, final long memprereqBytes) { public void setThreadPerformance(
final BusyThread thread = workerThreads.get(threadName); final String threadName,
if (thread != null) { final long idleMillis,
final long busyMillis,
final long memprereqBytes) {
final BusyThread thread = this.workerThreads.get(threadName);
if ( thread != null ) {
setConfig(threadName + "_idlesleep", thread.setIdleSleep(idleMillis)); setConfig(threadName + "_idlesleep", thread.setIdleSleep(idleMillis));
setConfig(threadName + "_busysleep", thread.setBusySleep(busyMillis)); setConfig(threadName + "_busysleep", thread.setBusySleep(busyMillis));
setConfig(threadName + "_memprereq", memprereqBytes); setConfig(threadName + "_memprereq", memprereqBytes);
thread.setMemPreReqisite(memprereqBytes); thread.setMemPreReqisite(memprereqBytes);
} }
} }
public synchronized void terminateThread(final String threadName, final boolean waitFor) { public synchronized void terminateThread(final String threadName, final boolean waitFor) {
if (workerThreads.containsKey(threadName)) { if ( this.workerThreads.containsKey(threadName) ) {
((WorkflowThread) workerThreads.get(threadName)).terminate(waitFor); ((WorkflowThread) this.workerThreads.get(threadName)).terminate(waitFor);
workerThreads.remove(threadName); this.workerThreads.remove(threadName);
} }
} }
public void intermissionAllThreads(final long pause) { public void intermissionAllThreads(final long pause) {
final Iterator<String> e = workerThreads.keySet().iterator(); final Iterator<String> e = this.workerThreads.keySet().iterator();
while (e.hasNext()) { while ( e.hasNext() ) {
workerThreads.get(e.next()).intermission(pause); this.workerThreads.get(e.next()).intermission(pause);
} }
} }
public synchronized void terminateAllThreads(final boolean waitFor) { public synchronized void terminateAllThreads(final boolean waitFor) {
Iterator<String> e = workerThreads.keySet().iterator(); Iterator<String> e = this.workerThreads.keySet().iterator();
while (e.hasNext()) { while ( e.hasNext() ) {
((WorkflowThread) workerThreads.get(e.next())).terminate(false); ((WorkflowThread) this.workerThreads.get(e.next())).terminate(false);
} }
if (waitFor) { if ( waitFor ) {
e = workerThreads.keySet().iterator(); e = this.workerThreads.keySet().iterator();
while (e.hasNext()) { while ( e.hasNext() ) {
((WorkflowThread) workerThreads.get(e.next())).terminate(true); ((WorkflowThread) this.workerThreads.get(e.next())).terminate(true);
e.remove(); e.remove();
} }
} }
} }
public String[] sessionsOlderThan(String threadName, long timeout) { public String[] sessionsOlderThan(String threadName, long timeout) {
final List<String> list = new ArrayList<String>(); final List<String> list = new ArrayList<String>();
final WorkflowThread st = getThread(threadName); final WorkflowThread st = getThread(threadName);
for (final Session s: ((serverCore) st).getJobList()) { for ( final Session s : ((serverCore) st).getJobList() ) {
if (!s.isAlive()) continue; if ( !s.isAlive() ) {
if (s.getTime() > timeout) { continue;
}
if ( s.getTime() > timeout ) {
list.add(s.getName()); list.add(s.getName());
} }
} }
return (String[]) list.toArray(); return (String[]) list.toArray();
} }
public void closeSessions(String threadName, String sessionName) { public void closeSessions(String threadName, String sessionName) {
if (sessionName == null) return; if ( sessionName == null ) {
return;
}
final WorkflowThread st = getThread(threadName); final WorkflowThread st = getThread(threadName);
for (final Session s: ((serverCore) st).getJobList()) { for ( final Session s : ((serverCore) st).getJobList() ) {
if ( if ( (s.isAlive()) && (s.getName().equals(sessionName)) ) {
(s.isAlive()) &&
(s.getName().equals(sessionName))
) {
// try to stop session // try to stop session
s.setStopped(true); s.setStopped(true);
try { Thread.sleep(100); } catch (final InterruptedException ex) {} try {
Thread.sleep(100);
} catch ( final InterruptedException ex ) {
}
// try to interrupt session // try to interrupt session
s.interrupt(); s.interrupt();
try { Thread.sleep(100); } catch (final InterruptedException ex) {} try {
Thread.sleep(100);
} catch ( final InterruptedException ex ) {
}
// try to close socket // try to close socket
if (s.isAlive()) { if ( s.isAlive() ) {
s.close(); s.close();
} }
// wait for session to finish // wait for session to finish
if (s.isAlive()) { if ( s.isAlive() ) {
try { s.join(500); } catch (final InterruptedException ex) {} try {
s.join(500);
} catch ( final InterruptedException ex ) {
}
} }
} }
} }
} }
public Iterator<String> /*of serverThread-Names (String)*/ threadNames() { public Iterator<String> /*of serverThread-Names (String)*/threadNames() {
return workerThreads.keySet().iterator(); return this.workerThreads.keySet().iterator();
} }
// authentication routines: // authentication routines:
public void setAuthentify(final InetAddress host, final String user, final String rights) { public void setAuthentify(final InetAddress host, final String user, final String rights) {
// sets access attributes according to host addresses // sets access attributes according to host addresses
authorization.put(host, user + "@" + rights); this.authorization.put(host, user + "@" + rights);
} }
public void removeAuthentify(final InetAddress host) { public void removeAuthentify(final InetAddress host) {
// remove access attributes according to host addresses // remove access attributes according to host addresses
authorization.remove(host); this.authorization.remove(host);
} }
public String getAuthentifyUser(final InetAddress host) { public String getAuthentifyUser(final InetAddress host) {
// read user name according to host addresses // read user name according to host addresses
final String a = authorization.get(host); final String a = this.authorization.get(host);
if (a == null) return null; if ( a == null ) {
final int p = a.indexOf('@'); return null;
if (p < 0) return null; }
return a.substring(0, p); final int p = a.indexOf('@');
if ( p < 0 ) {
return null;
}
return a.substring(0, p);
} }
public String getAuthentifyRights(final InetAddress host) { public String getAuthentifyRights(final InetAddress host) {
// read access rigths according to host addresses // read access rigths according to host addresses
final String a = authorization.get(host); final String a = this.authorization.get(host);
if (a == null) return null; if ( a == null ) {
final int p = a.indexOf('@'); return null;
if (p < 0) return null; }
return a.substring(p + 1); final int p = a.indexOf('@');
if ( p < 0 ) {
return null;
}
return a.substring(p + 1);
} }
public void addAuthentifyRight(final InetAddress host, final String right) { public void addAuthentifyRight(final InetAddress host, final String right) {
final String rights = getAuthentifyRights(host); final String rights = getAuthentifyRights(host);
if (rights == null) { if ( rights == null ) {
// create new authentication // create new authentication
setAuthentify(host, "unknown", right); setAuthentify(host, "unknown", right);
} else { } else {
// add more authentication // add more authentication
final String user = getAuthentifyUser(host); final String user = getAuthentifyUser(host);
setAuthentify(host, user, rights + right); setAuthentify(host, user, rights + right);
} }
} }
public boolean hasAuthentifyRight(final InetAddress host, final String right) { public boolean hasAuthentifyRight(final InetAddress host, final String right) {
final String rights = getAuthentifyRights(host); final String rights = getAuthentifyRights(host);
if (rights == null) return false; if ( rights == null ) {
return rights.indexOf(right) >= 0; return false;
}
return rights.indexOf(right) >= 0;
} }
public File getDataPath() { public File getDataPath() {
return this.dataPath; return this.dataPath;
} }
public File getAppPath() { public File getAppPath() {
return this.appPath; return this.appPath;
} }
@Override @Override
public String toString() { public String toString() {
return configProps.toString(); return this.configProps.toString();
} }
public void handleBusyState(final int jobs) { public void handleBusyState(final int jobs) {
serverJobs = jobs; this.serverJobs = jobs;
} }
public void track(final String host, final String accessPath) { public void track(final String host, final String accessPath) {
this.accessTracker.track(host, accessPath); this.accessTracker.track(host, accessPath);
} }
public Collection<Track> accessTrack(final String host) { public Collection<Track> accessTrack(final String host) {
return this.accessTracker.accessTrack(host); return this.accessTracker.accessTrack(host);
} }
public int latestAccessCount(final String host, final long timedelta) { public int latestAccessCount(final String host, final long timedelta) {
return this.accessTracker.latestAccessCount(host, timedelta); return this.accessTracker.latestAccessCount(host, timedelta);
} }
public Iterator<String> accessHosts() { public Iterator<String> accessHosts() {
return this.accessTracker.accessHosts(); return this.accessTracker.accessHosts();
} }
/** /**
* Retrieve text data (e. g. config file) from file * Retrieve text data (e. g. config file) from file file may be an url or a filename with path relative to
* rootPath parameter
* *
* file may be an url or a filename with path relative to rootPath parameter
* @param file url or filename * @param file url or filename
* @param rootPath searchpath for file * @param rootPath searchpath for file
* @param file file to use when remote fetching fails (null if unused) * @param file file to use when remote fetching fails (null if unused)
*/ */
public Reader getConfigFileFromWebOrLocally(final String uri, public Reader getConfigFileFromWebOrLocally(final String uri, final String rootPath, final File file)
final String rootPath, final File file) throws IOException, FileNotFoundException { throws IOException,
if (uri.startsWith("http://") || uri.startsWith("https://")) { FileNotFoundException {
if ( uri.startsWith("http://") || uri.startsWith("https://") ) {
final String[] uris = uri.split(","); final String[] uris = uri.split(",");
for (String netdef: uris) { for ( String netdef : uris ) {
netdef = netdef.trim(); netdef = netdef.trim();
try { try {
final RequestHeader reqHeader = new RequestHeader(); final RequestHeader reqHeader = new RequestHeader();
@ -574,52 +620,57 @@ public class serverSwitch {
final HTTPClient client = new HTTPClient(); final HTTPClient client = new HTTPClient();
client.setHeader(reqHeader.entrySet()); client.setHeader(reqHeader.entrySet());
byte[] data = client.GETbytes(uri); byte[] data = client.GETbytes(uri);
if (data == null || data.length == 0) continue; if ( data == null || data.length == 0 ) {
continue;
}
// save locally in case next fetch fails // save locally in case next fetch fails
if (file != null) { if ( file != null ) {
FileOutputStream f = new FileOutputStream(file); FileOutputStream f = new FileOutputStream(file);
f.write(data); f.write(data);
f.close(); f.close();
} }
return new InputStreamReader(new BufferedInputStream(new ByteArrayInputStream(data))); return new InputStreamReader(new BufferedInputStream(new ByteArrayInputStream(data)));
} catch (final Exception e) { } catch ( final Exception e ) {
continue; continue;
} }
} }
if (file != null && file.exists()) { if ( file != null && file.exists() ) {
return new FileReader(file); return new FileReader(file);
} else { } else {
throw new FileNotFoundException(); throw new FileNotFoundException();
} }
} else { } else {
final File f = (uri.length() > 0 && uri.startsWith("/")) ? new File(uri) : new File(rootPath, uri); final File f =
if (f.exists()) { (uri.length() > 0 && uri.startsWith("/")) ? new File(uri) : new File(rootPath, uri);
return new FileReader(f); if ( f.exists() ) {
return new FileReader(f);
} else { } else {
throw new FileNotFoundException(f.toString()); throw new FileNotFoundException(f.toString());
} }
} }
} }
private static Random pwGenerator = new Random(); private static Random pwGenerator = new Random();
/** /**
* Generates a random password. * Generates a random password.
*
* @return random password which is 20 characters long. * @return random password which is 20 characters long.
*/ */
public String genRandomPassword() { public String genRandomPassword() {
return genRandomPassword(20); return genRandomPassword(20);
} }
/** /**
* Generates a random password of a given length. * Generates a random password of a given length.
*
* @param length length o password * @param length length o password
* @return password of given length * @return password of given length
*/ */
public String genRandomPassword(final int length) { public String genRandomPassword(final int length) {
byte[] bytes = new byte[length]; byte[] bytes = new byte[length];
pwGenerator.nextBytes(bytes); pwGenerator.nextBytes(bytes);
return Digest.encodeMD5Hex(bytes); return Digest.encodeMD5Hex(bytes);
} }
} }

File diff suppressed because it is too large Load Diff

@ -28,7 +28,6 @@
package net.yacy.peers.graphics; package net.yacy.peers.graphics;
import java.io.File; import java.io.File;
import java.io.IOException;
import java.text.ParseException; import java.text.ParseException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
@ -64,15 +63,15 @@ import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.LookAheadIterator; import net.yacy.kelondro.util.LookAheadIterator;
public class WebStructureGraph
public class WebStructureGraph { {
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia) public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
public static int maxhosts = 50000; // maximum number of hosts in web structure map public static int maxhosts = 50000; // maximum number of hosts in web structure map
private final static Log log = new Log("WebStructureGraph"); private final static Log log = new Log("WebStructureGraph");
private final File structureFile; private final File structureFile;
private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}* private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, String> structure_new; private final TreeMap<String, String> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue; private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
@ -80,9 +79,11 @@ public class WebStructureGraph {
private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null); private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null);
private static class leanrefObject { private static class leanrefObject
{
private final DigestURI url; private final DigestURI url;
private final Set<MultiProtocolURI> globalRefURLs; private final Set<MultiProtocolURI> globalRefURLs;
private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) { private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
this.url = url; this.url = url;
this.globalRefURLs = globalRefURLs; this.globalRefURLs = globalRefURLs;
@ -98,73 +99,92 @@ public class WebStructureGraph {
// load web structure // load web structure
Map<String, String> loadedStructure; Map<String, String> loadedStructure;
try { try {
loadedStructure = (this.structureFile.exists()) ? FileUtils.loadMap(this.structureFile) : new TreeMap<String, String>(); loadedStructure =
} catch (final OutOfMemoryError e) { (this.structureFile.exists())
? FileUtils.loadMap(this.structureFile)
: new TreeMap<String, String>();
} catch ( final OutOfMemoryError e ) {
loadedStructure = new TreeMap<String, String>(); loadedStructure = new TreeMap<String, String>();
} }
if (loadedStructure != null) this.structure_old.putAll(loadedStructure); if ( loadedStructure != null ) {
this.structure_old.putAll(loadedStructure);
}
// delete out-dated entries in case the structure is too big // delete out-dated entries in case the structure is too big
if (this.structure_old.size() > maxhosts) { if ( this.structure_old.size() > maxhosts ) {
// fill a set with last-modified - dates of the structure // fill a set with last-modified - dates of the structure
final TreeSet<String> delset = new TreeSet<String>(); final TreeSet<String> delset = new TreeSet<String>();
String key, value; String key, value;
for (final Map.Entry<String, String> entry : this.structure_old.entrySet()) { for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) {
key = entry.getKey(); key = entry.getKey();
value = entry.getValue(); value = entry.getValue();
if (value.length() >= 8) delset.add(value.substring(0, 8) + key); if ( value.length() >= 8 ) {
} delset.add(value.substring(0, 8) + key);
int delcount = this.structure_old.size() - (maxhosts * 9 / 10); }
final Iterator<String> j = delset.iterator(); }
while ((delcount > 0) && (j.hasNext())) { int delcount = this.structure_old.size() - (maxhosts * 9 / 10);
this.structure_old.remove(j.next().substring(8)); final Iterator<String> j = delset.iterator();
delcount--; while ( (delcount > 0) && (j.hasNext()) ) {
} this.structure_old.remove(j.next().substring(8));
delcount--;
}
} }
this.publicRefDNSResolvingWorker = new PublicRefDNSResolvingProcess(); this.publicRefDNSResolvingWorker = new PublicRefDNSResolvingProcess();
this.publicRefDNSResolvingWorker.start(); this.publicRefDNSResolvingWorker.start();
} }
private class PublicRefDNSResolvingProcess extends Thread { private class PublicRefDNSResolvingProcess extends Thread
{
private PublicRefDNSResolvingProcess() { private PublicRefDNSResolvingProcess() {
} }
@Override
public void run() { public void run() {
leanrefObject lro; leanrefObject lro;
try { try {
while ((lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON) { while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) {
learnrefs(lro); learnrefs(lro);
} }
} catch (final InterruptedException e) { } catch ( final InterruptedException e ) {
} }
} }
} }
public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser) { public void generateCitationReference(
final DigestURI url,
final Document document,
final Condenser condenser) {
// generate citation reference // generate citation reference
if (url.isLocal()) return; // we do this only for global urls if ( url.isLocal() ) {
return; // we do this only for global urls
}
final Map<MultiProtocolURI, String> hl = document.getHyperlinks(); final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
final Iterator<MultiProtocolURI> it = hl.keySet().iterator(); final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>(); final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>();
final String refhost = url.getHost(); final String refhost = url.getHost();
MultiProtocolURI u; MultiProtocolURI u;
int maxref = 1000; int maxref = 1000;
while (it.hasNext() && maxref-- > 0) { while ( it.hasNext() && maxref-- > 0 ) {
u = it.next(); u = it.next();
if (u == null) continue; if ( u == null ) {
if (refhost != null && u.getHost() != null && !u.getHost().equals(refhost)) { continue;
}
if ( refhost != null && u.getHost() != null && !u.getHost().equals(refhost) ) {
// this is a global link // this is a global link
globalRefURLs.add(u); globalRefURLs.add(u);
} }
} }
final leanrefObject lro = new leanrefObject(url, globalRefURLs); final leanrefObject lro = new leanrefObject(url, globalRefURLs);
if (globalRefURLs.size() > 0) try { if ( globalRefURLs.size() > 0 ) {
if (this.publicRefDNSResolvingWorker.isAlive()) { try {
this.publicRefDNSResolvingQueue.put(lro); if ( this.publicRefDNSResolvingWorker.isAlive() ) {
} else { this.publicRefDNSResolvingQueue.put(lro);
} else {
learnrefs(lro);
}
} catch ( final InterruptedException e ) {
learnrefs(lro); learnrefs(lro);
} }
} catch (final InterruptedException e) {
learnrefs(lro);
} }
} }
@ -173,16 +193,22 @@ public class WebStructureGraph {
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
//final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part //final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part
String nexturlhash; String nexturlhash;
for (final MultiProtocolURI u: lro.globalRefURLs) { for ( final MultiProtocolURI u : lro.globalRefURLs ) {
final byte[] nexturlhashb = new DigestURI(u).hash(); final byte[] nexturlhashb = new DigestURI(u).hash();
assert nexturlhashb != null; assert nexturlhashb != null;
if (nexturlhashb != null) { if ( nexturlhashb != null ) {
nexturlhash = ASCII.String(nexturlhashb); nexturlhash = ASCII.String(nexturlhashb);
assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash; assert nexturlhash.length() == 12 : "nexturlhash.length() = "
+ nexturlhash.length()
+ ", nexturlhash = "
+ nexturlhash;
//assert !nexturlhash.substring(6).equals(refhashp); //assert !nexturlhash.substring(6).equals(refhashp);
// this is a global link // this is a global link
cpg.append(nexturlhash); // store complete hash cpg.append(nexturlhash); // store complete hash
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); assert cpg.length() % 12 == 0 : "cpg.length() = "
+ cpg.length()
+ ", cpg = "
+ cpg.toString();
} }
} }
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
@ -190,22 +216,26 @@ public class WebStructureGraph {
} }
private static int refstr2count(final String refs) { private static int refstr2count(final String refs) {
if ((refs == null) || (refs.length() <= 8)) return 0; if ( (refs == null) || (refs.length() <= 8) ) {
return 0;
}
assert (refs.length() - 8) % 10 == 0 : "refs = " + refs + ", length = " + refs.length(); assert (refs.length() - 8) % 10 == 0 : "refs = " + refs + ", length = " + refs.length();
return (refs.length() - 8) / 10; return (refs.length() - 8) / 10;
} }
static Map<String, Integer> refstr2map(final String refs) { static Map<String, Integer> refstr2map(final String refs) {
if ((refs == null) || (refs.length() <= 8)) return new HashMap<String, Integer>(); if ( (refs == null) || (refs.length() <= 8) ) {
return new HashMap<String, Integer>();
}
final Map<String, Integer> map = new HashMap<String, Integer>(); final Map<String, Integer> map = new HashMap<String, Integer>();
String c; String c;
final int refsc = refstr2count(refs); final int refsc = refstr2count(refs);
int d; int d;
for (int i = 0; i < refsc; i++) { for ( int i = 0; i < refsc; i++ ) {
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10); c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
try { try {
d = Integer.valueOf(c.substring(6), 16); d = Integer.valueOf(c.substring(6), 16);
} catch (final NumberFormatException e) { } catch ( final NumberFormatException e ) {
d = 1; d = 1;
} }
map.put(c.substring(0, 6), d); map.put(c.substring(0, 6), d);
@ -217,19 +247,19 @@ public class WebStructureGraph {
final StringBuilder s = new StringBuilder(map.size() * 10); final StringBuilder s = new StringBuilder(map.size() * 10);
s.append(GenericFormatter.SHORT_DAY_FORMATTER.format()); s.append(GenericFormatter.SHORT_DAY_FORMATTER.format());
String h; String h;
for (final Map.Entry<String, Integer> entry : map.entrySet()) { for ( final Map.Entry<String, Integer> entry : map.entrySet() ) {
s.append(entry.getKey()); s.append(entry.getKey());
h = Integer.toHexString(entry.getValue().intValue()); h = Integer.toHexString(entry.getValue().intValue());
final int hl = h.length(); final int hl = h.length();
if (hl == 0) { if ( hl == 0 ) {
s.append("0000"); s.append("0000");
} else if (hl == 1) { } else if ( hl == 1 ) {
s.append("000").append(h); s.append("000").append(h);
} else if (hl == 2) { } else if ( hl == 2 ) {
s.append("00").append(h); s.append("00").append(h);
} else if (hl == 3) { } else if ( hl == 3 ) {
s.append('0').append(h); s.append('0').append(h);
} else if (hl == 4) { } else if ( hl == 4 ) {
s.append(h); s.append(h);
} else { } else {
s.append("FFFF"); s.append("FFFF");
@ -246,11 +276,11 @@ public class WebStructureGraph {
String hostname = ""; String hostname = "";
String date = ""; String date = "";
String ref; String ref;
synchronized (this.structure_old) { synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash); tailMap = this.structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) { if ( key.startsWith(hosthash) ) {
hostname = key.substring(7); hostname = key.substring(7);
ref = tailMap.get(key); ref = tailMap.get(key);
date = ref.substring(0, 8); date = ref.substring(0, 8);
@ -258,68 +288,87 @@ public class WebStructureGraph {
} }
} }
} }
synchronized (this.structure_new) { synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash); tailMap = this.structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) { if ( key.startsWith(hosthash) ) {
ref = tailMap.get(key); ref = tailMap.get(key);
if (hostname.length() == 0) hostname = key.substring(7); if ( hostname.length() == 0 ) {
if (date.length() == 0) date = ref.substring(0, 8); hostname = key.substring(7);
}
if ( date.length() == 0 ) {
date = ref.substring(0, 8);
}
h.putAll(refstr2map(ref)); h.putAll(refstr2map(ref));
} }
} }
} }
if (h.isEmpty()) return null; if ( h.isEmpty() ) {
return null;
}
return new StructureEntry(hosthash, hostname, date, h); return new StructureEntry(hosthash, hostname, date, h);
} }
public StructureEntry incomingReferences(final String hosthash) { public StructureEntry incomingReferences(final String hosthash) {
final String hostname = hostHash2hostName(hosthash); final String hostname = hostHash2hostName(hosthash);
if (hostname == null) return null; if ( hostname == null ) {
return null;
}
// collect the references // collect the references
WebStructureGraph.StructureEntry sentry; WebStructureGraph.StructureEntry sentry;
final HashMap<String, Integer> hosthashes = new HashMap<String, Integer>(); final HashMap<String, Integer> hosthashes = new HashMap<String, Integer>();
Iterator<WebStructureGraph.StructureEntry> i = new StructureIterator(false); Iterator<WebStructureGraph.StructureEntry> i = new StructureIterator(false);
while (i.hasNext()) { while ( i.hasNext() ) {
sentry = i.next(); sentry = i.next();
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash)); if ( sentry.references.containsKey(hosthash) ) {
hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
}
} }
i = new StructureIterator(true); i = new StructureIterator(true);
while (i.hasNext()) { while ( i.hasNext() ) {
sentry = i.next(); sentry = i.next();
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash)); if ( sentry.references.containsKey(hosthash) ) {
hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
}
} }
// construct a new structureEntry Object // construct a new structureEntry Object
return new StructureEntry( return new StructureEntry(
hosthash, hosthash,
hostname, hostname,
GenericFormatter.SHORT_DAY_FORMATTER.format(), GenericFormatter.SHORT_DAY_FORMATTER.format(),
hosthashes); hosthashes);
} }
public static class HostReferenceFactory implements ReferenceFactory<HostReference> { public static class HostReferenceFactory implements ReferenceFactory<HostReference>
{
private static final Row hostReferenceRow = new Row("String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}", Base64Order.enhancedCoder); private static final Row hostReferenceRow = new Row(
"String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}",
Base64Order.enhancedCoder);
public HostReferenceFactory() { public HostReferenceFactory() {
} }
@Override
public Row getRow() { public Row getRow() {
return hostReferenceRow; return hostReferenceRow;
} }
@Override
public HostReference produceSlow(final Entry e) { public HostReference produceSlow(final Entry e) {
return new HostReference(e); return new HostReference(e);
} }
@Override
public HostReference produceFast(final HostReference e) { public HostReference produceFast(final HostReference e) {
return e; return e;
} }
} }
public static class HostReference extends AbstractReference implements Reference { public static class HostReference extends AbstractReference implements Reference
{
private final Row.Entry entry; private final Row.Entry entry;
@ -339,14 +388,17 @@ public class WebStructureGraph {
this.entry = entry; this.entry = entry;
} }
@Override
public String toPropertyForm() { public String toPropertyForm() {
return this.entry.toPropertyForm(':', true, true, false, true); return this.entry.toPropertyForm(':', true, true, false, true);
} }
@Override
public Entry toKelondroEntry() { public Entry toKelondroEntry() {
return this.entry; return this.entry;
} }
@Override
public byte[] urlhash() { public byte[] urlhash() {
return this.entry.getPrimaryKeyBytes(); return this.entry.getPrimaryKeyBytes();
} }
@ -355,40 +407,50 @@ public class WebStructureGraph {
return (int) this.entry.getColLong(2); return (int) this.entry.getColLong(2);
} }
@Override
public long lastModified() { public long lastModified() {
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(1)); return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(1));
} }
@Override
public void join(final Reference r) { public void join(final Reference r) {
// joins two entries into one entry // joins two entries into one entry
final HostReference oe = (HostReference) r; final HostReference oe = (HostReference) r;
// combine date // combine date
final long o = oe.lastModified(); final long o = oe.lastModified();
if (lastModified() < o) this.entry.setCol(1, MicroDate.microDateDays(o)); if ( lastModified() < o ) {
this.entry.setCol(1, MicroDate.microDateDays(o));
}
// combine count // combine count
final int c = oe.count(); final int c = oe.count();
if (count() < c) this.entry.setCol(2, c); if ( count() < c ) {
this.entry.setCol(2, c);
}
} }
@Override
public Collection<Integer> positions() { public Collection<Integer> positions() {
return new ArrayList<Integer>(0); return new ArrayList<Integer>(0);
} }
} }
public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory(); public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory();
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null; public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
public static long hostReferenceIndexCacheTime = 0; public static long hostReferenceIndexCacheTime = 0;
public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
public synchronized ReferenceContainerCache<HostReference> incomingReferences() { public synchronized ReferenceContainerCache<HostReference> incomingReferences() {
// we return a cache if the cache is filled and not stale // we return a cache if the cache is filled and not stale
if (hostReferenceIndexCache != null && if ( hostReferenceIndexCache != null
hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis()) return hostReferenceIndexCache; && hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis() ) {
return hostReferenceIndexCache;
}
// collect the references // collect the references
final ReferenceContainerCache<HostReference> idx = new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6); final ReferenceContainerCache<HostReference> idx =
new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6);
// we iterate over all structure entries. // we iterate over all structure entries.
// one structure entry has information that a specific host links to a list of other hosts // one structure entry has information that a specific host links to a list of other hosts
@ -403,40 +465,47 @@ public class WebStructureGraph {
} }
private void incomingReferencesEnrich( private void incomingReferencesEnrich(
final ReferenceContainerCache<HostReference> idx, final ReferenceContainerCache<HostReference> idx,
final Iterator<WebStructureGraph.StructureEntry> structureIterator, final Iterator<WebStructureGraph.StructureEntry> structureIterator,
final long time) { final long time) {
// we iterate over all structure entries. // we iterate over all structure entries.
// one structure entry has information that a specific host links to a list of other hosts // one structure entry has information that a specific host links to a list of other hosts
final long timeout = System.currentTimeMillis() + time; final long timeout = System.currentTimeMillis() + time;
byte[] term; byte[] term;
HostReference hr; HostReference hr;
WebStructureGraph.StructureEntry sentry; WebStructureGraph.StructureEntry sentry;
structureLoop: while (structureIterator.hasNext()) { structureLoop: while ( structureIterator.hasNext() ) {
sentry = structureIterator.next(); sentry = structureIterator.next();
// then we loop over all the hosts that are linked from sentry.hosthash // then we loop over all the hosts that are linked from sentry.hosthash
refloop: for (final Map.Entry<String, Integer> refhosthashandcounter: sentry.references.entrySet()) { refloop: for ( final Map.Entry<String, Integer> refhosthashandcounter : sentry.references
.entrySet() ) {
term = UTF8.getBytes(refhosthashandcounter.getKey()); term = UTF8.getBytes(refhosthashandcounter.getKey());
try { try {
hr = new HostReference(ASCII.getBytes(sentry.hosthash), GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), refhosthashandcounter.getValue().intValue()); hr =
} catch (final ParseException e) { new HostReference(
ASCII.getBytes(sentry.hosthash),
GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(),
refhosthashandcounter.getValue().intValue());
} catch ( final ParseException e ) {
continue refloop; continue refloop;
} }
// each term refers to an index entry. look if we already have such an entry // each term refers to an index entry. look if we already have such an entry
ReferenceContainer<HostReference> r = idx.get(term, null); ReferenceContainer<HostReference> r = idx.get(term, null);
try { try {
if (r == null) { if ( r == null ) {
r = new ReferenceContainer<HostReference>(hostReferenceFactory, term); r = new ReferenceContainer<HostReference>(hostReferenceFactory, term);
r.add(hr); r.add(hr);
idx.add(r); idx.add(r);
} else { } else {
r.put(hr); r.put(hr);
} }
} catch (final RowSpaceExceededException e) { } catch ( final RowSpaceExceededException e ) {
continue refloop; continue refloop;
} }
} }
if (System.currentTimeMillis() > timeout) break structureLoop; if ( System.currentTimeMillis() > timeout ) {
break structureLoop;
}
} }
} }
@ -459,23 +528,25 @@ public class WebStructureGraph {
public int referencesCount(final String hosthash) { public int referencesCount(final String hosthash) {
// returns the number of hosts that are referenced by this hosthash // returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash; assert hosthash.length() == 6 : "hosthash = " + hosthash;
if (hosthash == null || hosthash.length() != 6) return 0; if ( hosthash == null || hosthash.length() != 6 ) {
return 0;
}
SortedMap<String, String> tailMap; SortedMap<String, String> tailMap;
int c = 0; int c = 0;
synchronized (this.structure_old) { synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash); tailMap = this.structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) { if ( key.startsWith(hosthash) ) {
c = refstr2count(tailMap.get(key)); c = refstr2count(tailMap.get(key));
} }
} }
} }
synchronized (this.structure_new) { synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash); tailMap = this.structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) { if ( key.startsWith(hosthash) ) {
c += refstr2count(tailMap.get(key)); c += refstr2count(tailMap.get(key));
} }
} }
@ -487,20 +558,20 @@ public class WebStructureGraph {
// returns the host as string, null if unknown // returns the host as string, null if unknown
assert hosthash.length() == 6; assert hosthash.length() == 6;
SortedMap<String, String> tailMap; SortedMap<String, String> tailMap;
synchronized(this.structure_old) { synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash); tailMap = this.structure_old.tailMap(hosthash);
if (!tailMap.isEmpty()) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) { if ( key.startsWith(hosthash) ) {
return key.substring(7); return key.substring(7);
} }
} }
} }
synchronized(this.structure_new) { synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash); tailMap = this.structure_new.tailMap(hosthash);
if (!tailMap.isEmpty()) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if (key.startsWith(hosthash)) { if ( key.startsWith(hosthash) ) {
return key.substring(7); return key.substring(7);
} }
} }
@ -513,53 +584,61 @@ public class WebStructureGraph {
// parse the new reference string and join it with the stored references // parse the new reference string and join it with the stored references
final StructureEntry structure = outgoingReferences(hosthash); final StructureEntry structure = outgoingReferences(hosthash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references; final Map<String, Integer> refs =
assert reference.length() % 12 == 0 : "reference.length() = " + reference.length() + ", reference = " + reference.toString(); (structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0 : "reference.length() = "
+ reference.length()
+ ", reference = "
+ reference.toString();
String dom; String dom;
int c; int c;
for (int i = 0; i < reference.length() / 12; i++) { for ( int i = 0; i < reference.length() / 12; i++ ) {
dom = reference.substring(i * 12 + 6, (i + 1) * 12); dom = reference.substring(i * 12 + 6, (i + 1) * 12);
c = 0; c = 0;
if (refs.containsKey(dom)) { if ( refs.containsKey(dom) ) {
c = (refs.get(dom)).intValue(); c = (refs.get(dom)).intValue();
} }
refs.put(dom, Integer.valueOf(++c)); refs.put(dom, Integer.valueOf(++c));
} }
// check if the maxref is exceeded // check if the maxref is exceeded
if (refs.size() > maxref) { if ( refs.size() > maxref ) {
int shrink = refs.size() - (maxref * 9 / 10); int shrink = refs.size() - (maxref * 9 / 10);
delloop: while (shrink > 0) { delloop: while ( shrink > 0 ) {
// shrink the references: the entry with the smallest number of references is removed // shrink the references: the entry with the smallest number of references is removed
int minrefcount = Integer.MAX_VALUE; int minrefcount = Integer.MAX_VALUE;
String minrefkey = null; String minrefkey = null;
findloop: for (final Map.Entry<String, Integer> entry : refs.entrySet()) { findloop: for ( final Map.Entry<String, Integer> entry : refs.entrySet() ) {
if (entry.getValue().intValue() < minrefcount) { if ( entry.getValue().intValue() < minrefcount ) {
minrefcount = entry.getValue().intValue(); minrefcount = entry.getValue().intValue();
minrefkey = entry.getKey(); minrefkey = entry.getKey();
} }
if (minrefcount == 1) break findloop; if ( minrefcount == 1 ) {
break findloop;
}
} }
// remove the smallest // remove the smallest
if (minrefkey == null) break delloop; if ( minrefkey == null ) {
break delloop;
}
refs.remove(minrefkey); refs.remove(minrefkey);
shrink--; shrink--;
} }
} }
// store the map back to the structure // store the map back to the structure
synchronized(this.structure_new) { synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs)); this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
} }
} }
private static void joinStructure(final TreeMap<String, String> into, final TreeMap<String, String> from) { private static void joinStructure(final TreeMap<String, String> into, final TreeMap<String, String> from) {
for (final Map.Entry<String, String> e: from.entrySet()) { for ( final Map.Entry<String, String> e : from.entrySet() ) {
if (into.containsKey(e.getKey())) { if ( into.containsKey(e.getKey()) ) {
final Map<String, Integer> s0 = refstr2map(into.get(e.getKey())); final Map<String, Integer> s0 = refstr2map(into.get(e.getKey()));
final Map<String, Integer> s1 = refstr2map(e.getValue()); final Map<String, Integer> s1 = refstr2map(e.getValue());
for (final Map.Entry<String, Integer> r: s1.entrySet()) { for ( final Map.Entry<String, Integer> r : s1.entrySet() ) {
if (s0.containsKey(r.getKey())) { if ( s0.containsKey(r.getKey()) ) {
s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue()); s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue());
} else { } else {
s0.put(r.getKey(), r.getValue().intValue()); s0.put(r.getKey(), r.getValue().intValue());
@ -573,7 +652,7 @@ public class WebStructureGraph {
} }
public void joinOldNew() { public void joinOldNew() {
synchronized(this.structure_new) { synchronized ( this.structure_new ) {
joinStructure(this.structure_old, this.structure_new); joinStructure(this.structure_old, this.structure_new);
this.structure_new.clear(); this.structure_new.clear();
} }
@ -584,10 +663,10 @@ public class WebStructureGraph {
String maxhost = null; String maxhost = null;
int refsize, maxref = 0; int refsize, maxref = 0;
joinOldNew(); joinOldNew();
synchronized(this.structure_new) { synchronized ( this.structure_new ) {
for (final Map.Entry<String, String> entry : this.structure_old.entrySet()) { for ( final Map.Entry<String, String> entry : this.structure_old.entrySet() ) {
refsize = entry.getValue().length(); refsize = entry.getValue().length();
if (refsize > maxref) { if ( refsize > maxref ) {
maxref = refsize; maxref = refsize;
maxhost = entry.getKey().substring(7); maxhost = entry.getKey().substring(7);
} }
@ -600,41 +679,59 @@ public class WebStructureGraph {
return new StructureIterator(latest); return new StructureIterator(latest);
} }
private class StructureIterator extends LookAheadIterator<StructureEntry> implements Iterator<StructureEntry> { private class StructureIterator extends LookAheadIterator<StructureEntry> implements
Iterator<StructureEntry>
{
private final Iterator<Map.Entry<String, String>> i; private final Iterator<Map.Entry<String, String>> i;
private StructureIterator(final boolean latest) { private StructureIterator(final boolean latest) {
this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old).entrySet().iterator(); this.i =
((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old)
.entrySet()
.iterator();
} }
@Override
public StructureEntry next0() { public StructureEntry next0() {
Map.Entry<String, String> entry = null; Map.Entry<String, String> entry = null;
String dom = null, ref = ""; String dom = null, ref = "";
while (this.i.hasNext()) { while ( this.i.hasNext() ) {
entry = this.i.next(); entry = this.i.next();
ref = entry.getValue(); ref = entry.getValue();
if ((ref.length() - 8) % 10 != 0) continue; if ( (ref.length() - 8) % 10 != 0 ) {
continue;
}
dom = entry.getKey(); dom = entry.getKey();
if (dom.length() >= 8) break; if ( dom.length() >= 8 ) {
break;
}
dom = null; dom = null;
} }
if (entry == null || dom == null) return null; if ( entry == null || dom == null ) {
return null;
}
assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length(); assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length();
return new StructureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref)); return new StructureEntry(
dom.substring(0, 6),
dom.substring(7),
ref.substring(0, 8),
refstr2map(ref));
} }
} }
public static class StructureEntry { public static class StructureEntry
{
public String hosthash; // the tail of the host hash public String hosthash; // the tail of the host hash
public String hostname; // the host name public String hostname; // the host name
public String date; // date of latest change public String date; // date of latest change
public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host
private StructureEntry( private StructureEntry(
final String hosthash, final String hosthash,
final String hostname, final String hostname,
final String date, final String date,
final Map<String, Integer> references) { final Map<String, Integer> references) {
this.hosthash = hosthash; this.hosthash = hosthash;
this.hostname = hostname; this.hostname = hostname;
this.date = date; this.date = date;
@ -644,30 +741,42 @@ public class WebStructureGraph {
public void close() { public void close() {
// finish dns resolving queue // finish dns resolving queue
if (this.publicRefDNSResolvingWorker.isAlive()) { if ( this.publicRefDNSResolvingWorker.isAlive() ) {
log.logInfo("Waiting for the DNS Resolving Queue to terminate"); log.logInfo("Waiting for the DNS Resolving Queue to terminate");
try { try {
this.publicRefDNSResolvingQueue.put(leanrefObjectPOISON); this.publicRefDNSResolvingQueue.put(leanrefObjectPOISON);
this.publicRefDNSResolvingWorker.join(5000); this.publicRefDNSResolvingWorker.join(5000);
} catch (final InterruptedException e) { } catch ( final InterruptedException e ) {
} }
} }
// save to web structure file // save to web structure file
log.logInfo("Saving Web Structure File: new = " + this.structure_new.size() + " entries, old = " + this.structure_old.size() + " entries"); log.logInfo("Saving Web Structure File: new = "
+ this.structure_new.size()
+ " entries, old = "
+ this.structure_old.size()
+ " entries");
final long time = System.currentTimeMillis(); final long time = System.currentTimeMillis();
joinOldNew(); joinOldNew();
if (this.structure_old.size() > 0) try { if ( this.structure_old.size() > 0 ) {
synchronized(this.structure_old) { synchronized ( this.structure_old ) {
if (this.structure_old.size() > 0) { if ( this.structure_old.size() > 0 ) {
FileUtils.saveMap(this.structureFile, this.structure_old, "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*"); FileUtils
.saveMap(
this.structureFile,
this.structure_old,
"Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
final long t = Math.max(1, System.currentTimeMillis() - time); final long t = Math.max(1, System.currentTimeMillis() - time);
log.logInfo("Saved Web Structure File: " + this.structure_old.size() + " entries in " + t + " milliseconds, " + (this.structure_old.size() * 1000 / t) + " entries/second"); log.logInfo("Saved Web Structure File: "
+ this.structure_old.size()
+ " entries in "
+ t
+ " milliseconds, "
+ (this.structure_old.size() * 1000 / t)
+ " entries/second");
} }
this.structure_old.clear(); this.structure_old.clear();
} }
} catch (final IOException e) {
Log.logException(e);
} }
} }
} }

Loading…
Cancel
Save