- enhancements to DNS IP caching and crawler speed

- bug fixes (NPEs)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7619 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent e7860b1239
commit f3baaca920

@@ -41,13 +41,13 @@
<dl>
<dt class="TableCellDark">Index Deletion</dt>
<dd><input type="checkbox" name="deleteIndex" id="deleteIndex"
onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteCache').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteCache').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteIndex">Delete Search Index</label><br/>
<input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /><label for="deleteCrawlQueues">Stop Crawler and delete Crawl Queues</label><br/>
<input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /><label for="deleteCache">Delete HTTP &amp; FTP Cache</label><br/>
<input type="checkbox" name="deleteRobots" id="deleteRobots" disabled="disabled" /><label for="deleteRobots">Delete robots.txt Cache</label><br/>
<input type="checkbox" name="deleteSearchFl" id="deleteSearchFl" disabled="disabled" /><label for="deleteSearchFl">Delete cached snippet-fetching failures during search</label><br/><br/><br/>
<input type="submit" name="deletecomplete" value="Delete"/>
<input type="submit" name="deletecomplete" id="deletecomplete" value="Delete" disabled="disabled"/>
</dd>
</dl>
</fieldset>

@@ -23,11 +23,9 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.net.MalformedURLException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.MediawikiImporter;
import net.yacy.kelondro.logging.Log;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@@ -57,33 +55,17 @@ public class IndexImportWikimedia_p {
} else {
if (post.containsKey("file")) {
final File sourcefile = new File(post.get("file"));
//final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
/*
if (!name.endsWith("pages-articles.xml.bz2")) {
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
return prop;
}
*/
try {
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
Log.logException(e);
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", e.getMessage());
}
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
}
return prop;
}
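
Because the MediawikiImporter constructor no longer declares MalformedURLException (see the hunk near the end of this commit), the servlet can start the import job without the old try/catch and without the MalformedURLException and Log imports. A minimal sketch of the resulting flow, using a hypothetical ImporterJob thread in place of the real MediawikiImporter and plain printing in place of serverObjects:

import java.io.File;

public class ImportStartSketch {
    // stand-in for MediawikiImporter; after this commit its constructor
    // throws no checked exception, so callers need no try/catch
    static class ImporterJob extends Thread {
        private final File sourcefile;
        ImporterJob(final File sourcefile) { this.sourcefile = sourcefile; }
        String source() { return this.sourcefile.getAbsolutePath(); }
        @Override public void run() { /* parse the dump here */ }
    }

    static ImporterJob job = null;

    public static void main(final String[] args) {
        final File sourcefile = new File("dewiki-20090311-pages-articles.xml.bz2");
        job = new ImporterJob(sourcefile); // cannot fail with MalformedURLException any more
        job.start();
        System.out.println("import_thread=started dump=" + job.source());
    }
}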

@@ -196,7 +196,7 @@ public class PerformanceMemory_p {
// other caching structures
prop.putNum("namecacheHit.size", Domains.nameCacheHitSize());
prop.putNum("namecacheMiss.size", Domains.nameCacheMissSize());
prop.putNum("namecache.noCache", Domains.nameCacheNoCachingListSize());
prop.putNum("namecache.noCache", 0);
prop.putNum("blacklistcache.size", Switchboard.urlBlacklist.blacklistCacheSize());
prop.putNum("searchevent.size", SearchEventCache.size());
prop.putNum("searchevent.hit", SearchEventCache.cacheHit);

@@ -453,13 +453,13 @@ public class Balancer {
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 2) {
if (loops < 3) {
rest = rest + 1000 * loops;
loops = 0;
}
if (rest > 0) {try {this.wait(rest); } catch (final InterruptedException e) {}}
for (int i = 0; i < loops; i++) {
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + ((loops - i) * 3) + " seconds remaining...");
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
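
The countdown arithmetic splits the forced crawl delay into whole seconds plus a sub-second remainder; with the threshold raised from 2 to 3, delays under three seconds collapse into a single wait, and the log message now reports the actual seconds remaining (the old "(loops - i) * 3" overstated it). A standalone sketch of the logic, with Thread.sleep standing in for this.wait (which requires the Balancer's monitor in the real code):

public class CrawlDelaySketch {
    static void delay(final long sleeptime, final String host) throws InterruptedException {
        long loops = sleeptime / 1000; // whole seconds to count down with log messages
        long rest  = sleeptime % 1000; // sub-second remainder
        if (loops < 3) {               // short delays: fold everything into one wait
            rest = rest + 1000 * loops;
            loops = 0;
        }
        if (rest > 0) Thread.sleep(rest);
        for (int i = 0; i < loops; i++) {
            System.out.println("waiting for " + host + ": " + (loops - i) + " seconds remaining...");
            Thread.sleep(1000);
        }
    }

    public static void main(final String[] args) throws InterruptedException {
        delay(2500, "example.org"); // one single 2500 ms wait, no countdown lines
        delay(3200, "example.org"); // three logged one-second waits plus 200 ms
    }
}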

@@ -51,15 +51,14 @@ public class Domains {
private static final String LOCAL_PATTERNS = "10\\..*,127\\..*,172\\.(1[6-9]|2[0-9]|3[0-1])\\..*,169\\.254\\..*,192\\.168\\..*,localhost";
private static final int MAX_NAME_CACHE_HIT_SIZE = 20000;
private static final int MAX_NAME_CACHE_MISS_SIZE = 20000;
private static final int MAX_NAME_NO_CACHING_LIST_SIZE = 20000;
private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() + 1;
// a dns cache
private static final ARC<String, InetAddress> NAME_CACHE_HIT = new ConcurrentARC<String, InetAddress>(MAX_NAME_CACHE_HIT_SIZE, CONCURRENCY_LEVEL);
private static final ARC<String, String> NAME_CACHE_MISS = new ConcurrentARC<String, String>(MAX_NAME_CACHE_MISS_SIZE, CONCURRENCY_LEVEL);
private static final ARC<String, String> NAME_CACHE_NO_CACHING_LIST = new ConcurrentARC<String, String>(MAX_NAME_NO_CACHING_LIST_SIZE, CONCURRENCY_LEVEL);
public static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>());
public static final List<Pattern> LOCALHOST_PATTERNS = makePatterns(LOCAL_PATTERNS);
private static final ConcurrentHashMap<String, Object> LOOKUP_SYNC = new ConcurrentHashMap<String, Object>();
private static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>());
private static final List<Pattern> LOCALHOST_PATTERNS = makePatterns(LOCAL_PATTERNS);
/**
* ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! !
@@ -496,46 +495,59 @@ public class Domains {
// try to resolve host by doing a name cache lookup
ip = NAME_CACHE_HIT.get(host);
if (ip != null) return ip;
if (NAME_CACHE_MISS.containsKey(host)) return null;
if (ip != null) {
//System.out.println("DNSLOOKUP-CACHE-HIT(CONC) " + host);
return ip;
}
if (NAME_CACHE_MISS.containsKey(host)) {
//System.out.println("DNSLOOKUP-CACHE-MISS(CONC) " + host);
return null;
}
// call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out
try {
boolean doCaching = true;
ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
if ((ip == null) ||
(ip.isLoopbackAddress()) ||
(NAME_CACHE_NO_CACHING_LIST.containsKey(host))
) {
doCaching = false;
} else {
if (matchesList(host, nameCacheNoCachingPatterns)) {
NAME_CACHE_NO_CACHING_LIST.put(host, PRESENT);
doCaching = false;
}
final Object sync_obj_new = new Object();
Object sync_obj = LOOKUP_SYNC.putIfAbsent(host, sync_obj_new);
if (sync_obj == null) sync_obj = sync_obj_new;
synchronized (sync_obj) {
// now look again if the host is in the cache where it may be meanwhile because of the synchronization
ip = NAME_CACHE_HIT.get(host);
if (ip != null) {
//System.out.println("DNSLOOKUP-CACHE-HIT(SYNC) " + host);
return ip;
}
if (NAME_CACHE_MISS.containsKey(host)) {
//System.out.println("DNSLOOKUP-CACHE-MISS(SYNC) " + host);
return null;
}
if (doCaching && ip != null) {
// do the dns lookup on the dns server
//if (!matchesList(host, nameCacheNoCachingPatterns)) System.out.println("DNSLOOKUP " + host);
try {
ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone
} catch (final UnknownHostException e) {
// add new entries
NAME_CACHE_MISS.put(host, PRESENT);
LOOKUP_SYNC.remove(host);
return null;
}
if ((ip != null) &&
(!ip.isLoopbackAddress()) &&
(!matchesList(host, nameCacheNoCachingPatterns))
) {
// add new entries
NAME_CACHE_HIT.put(host, ip);
}
LOOKUP_SYNC.remove(host);
return ip;
} catch (final UnknownHostException e) {
// remove old entries
flushMissNameCache();
// add new entries
NAME_CACHE_MISS.put(host, PRESENT);
}
return null;
}
private final static Pattern dotPattern = Pattern.compile("\\.");
private static final InetAddress parseInetAddress(final String ip) {
private static final InetAddress parseInetAddress(String ip) {
if (ip == null || ip.length() < 8) return null;
if (ip.equals("0:0:0:0:0:0:0:1%0")) ip = "127.0.0.1";
final String[] ips = dotPattern.split(ip);
if (ips.length != 4) return null;
final byte[] ipb = new byte[4];
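
The rewritten dnsResolve serializes lookups per host: LOOKUP_SYNC holds one monitor object per host name, installed with putIfAbsent, so concurrent requests for the same name block until the first thread has filled the hit or miss cache instead of issuing duplicate DNS queries. A self-contained sketch of the pattern, with plain ConcurrentHashMaps standing in for YaCy's size-bounded ConcurrentARC caches and the no-caching-pattern check omitted:

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.concurrent.ConcurrentHashMap;

public class DnsCacheSketch {
    private static final Object PRESENT = new Object();
    private static final ConcurrentHashMap<String, InetAddress> NAME_CACHE_HIT = new ConcurrentHashMap<String, InetAddress>();
    private static final ConcurrentHashMap<String, Object> NAME_CACHE_MISS = new ConcurrentHashMap<String, Object>();
    private static final ConcurrentHashMap<String, Object> LOOKUP_SYNC = new ConcurrentHashMap<String, Object>();

    public static InetAddress resolve(final String host) {
        // fast path: answer from the caches without any locking
        InetAddress ip = NAME_CACHE_HIT.get(host);
        if (ip != null) return ip;
        if (NAME_CACHE_MISS.containsKey(host)) return null;

        // one monitor per host: the first thread installs it, later threads
        // for the same host block here instead of querying DNS again
        final Object sync_obj_new = new Object();
        Object sync_obj = LOOKUP_SYNC.putIfAbsent(host, sync_obj_new);
        if (sync_obj == null) sync_obj = sync_obj_new;
        synchronized (sync_obj) {
            // look again: another thread may have resolved the host meanwhile
            ip = NAME_CACHE_HIT.get(host);
            if (ip != null) return ip;
            if (NAME_CACHE_MISS.containsKey(host)) return null;
            try {
                ip = InetAddress.getByName(host); // the actual DNS request
            } catch (final UnknownHostException e) {
                NAME_CACHE_MISS.put(host, PRESENT);
                LOOKUP_SYNC.remove(host);
                return null;
            }
            if (ip != null && !ip.isLoopbackAddress()) NAME_CACHE_HIT.put(host, ip);
            LOOKUP_SYNC.remove(host);
            return ip;
        }
    }

    public static void main(final String[] args) {
        System.out.println(resolve("localhost"));
    }
}

The new code path no longer calls flushMissNameCache() or maintains NAME_CACHE_NO_CACHING_LIST, which is why the following hunk removes those helpers and PerformanceMemory_p now reports a constant 0 for namecache.noCache.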
@@ -567,22 +579,6 @@ public class Domains {
return NAME_CACHE_MISS.size();
}
/**
* Returns the number of entries in the nameCacheNoCachingList list
*
* @return int The number of entries in the nameCacheNoCachingList list
*/
public static int nameCacheNoCachingListSize() {
return NAME_CACHE_NO_CACHING_LIST.size();
}
/**
* Removes old entries from the dns miss cache
*/
public static void flushMissNameCache() {
if (NAME_CACHE_MISS.size() > MAX_NAME_CACHE_MISS_SIZE) NAME_CACHE_MISS.clear();
}
private static String localHostName = "127.0.0.1";
private static Set<InetAddress> localHostAddresses = new HashSet<InetAddress>();
private static Set<String> localHostNames = new HashSet<String>();

@@ -201,9 +201,12 @@ public final class Condenser {
// images
final Iterator<ImageEntry> j = document.getImages().values().iterator();
ImageEntry ientry;
MultiProtocolURI url;
while (j.hasNext()) {
ientry = j.next();
insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
url = ientry.url();
if (url == null) continue;
insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
}
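
Both NPE fixes in the parser follow the same defensive pattern: ImageEntry.url() may legitimately return null, so the value is fetched once, checked, and the loop skips the entry instead of dereferencing null (the second occurrence, two hunks below, uses a labeled continue for the same purpose). A minimal sketch, with Img standing in for ImageEntry and String for MultiProtocolURI:

import java.util.Arrays;
import java.util.List;

public class NullGuardSketch {
    // stand-in for ImageEntry: url() may return null
    static class Img {
        private final String url;
        Img(final String url) { this.url = url; }
        String url() { return this.url; }
    }

    static void indexImages(final List<Img> images) {
        loop: for (final Img ientry : images) {
            final String url = ientry.url();
            if (url == null) continue loop; // skip the entry instead of throwing an NPE
            System.out.println("indexing " + url);
        }
    }

    public static void main(final String[] args) {
        indexImages(Arrays.asList(new Img("http://example.org/a.png"), new Img(null)));
    }
}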

@@ -481,7 +481,7 @@ dc_rights
final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Iterator<?> i = links.iterator();
Object o;
MultiProtocolURI url;
MultiProtocolURI url = null;
String u;
int pos;
loop: while (i.hasNext())
@@ -495,8 +495,9 @@ dc_rights
url = ((ImageEntry) o).url();
else {
assert false;
continue;
continue loop;
}
if (url == null) continue loop;
u = url.toNormalform(true, true);
if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
i.remove();

@@ -90,7 +90,7 @@ public class MediawikiImporter extends Thread implements Importer {
private String hostport, urlStub;
public MediawikiImporter(File sourcefile, File targetdir) throws MalformedURLException {
public MediawikiImporter(File sourcefile, File targetdir) {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
@@ -762,8 +762,6 @@ public class MediawikiImporter extends Thread implements Importer {
mi.join();
} catch (InterruptedException e) {
Log.logException(e);
} catch (IOException e) {
Log.logException(e);
}
}
