- some layout and text enhancements for the site crawl start page

- Quix0rs patch from http://forum.yacy-websuche.de/viewtopic.php?p=20839#p20839 (parts)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7163 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 8c1da27347
commit 29fe401f93

@ -25,6 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.ArrayList;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Date;
import java.util.Iterator;
@ -53,7 +54,7 @@ public class AccessTracker_p {
private static SimpleDateFormat SimpleFormatter = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US);
private static final List<Track> listclone(final List<Track> m) {
private static final Collection<Track> listclone(final Collection<Track> m) {
final List<Track> accessClone = new LinkedList<Track>();
try {
accessClone.addAll(m);
@ -76,7 +77,7 @@ public class AccessTracker_p {
if (page == 0) {
final Iterator<String> i = sb.accessHosts();
String host;
List<Track> access;
Collection<Track> access;
int entCount = 0;
try {
while ((entCount < maxCount) && (i.hasNext())) {
@ -106,7 +107,7 @@ public class AccessTracker_p {
if (page == 1) {
String host = (post == null) ? "" : post.get("host", "");
int entCount = 0;
List<Track> access;
Collection<Track> access;
Track entry;
if (host.length() > 0) {
access = sb.accessTrack(host);

@ -69,10 +69,10 @@
<option value="seldays" selected="selected">days</option>
</select> for new documents automatically.
</dd>
<dt><label>Path in Domain</label></dt>
<dt><label>Path</label></dt>
<dd>
<input type="radio" name="range" id="rangeDomain" value="domain" checked="checked"/>full domain<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" />only sub-path of given url
<input type="radio" name="range" id="rangeDomain" value="domain" checked="checked"/>load all files in domain<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" />load only files in a sub-path of given url
</dd>
<input type="hidden" name="mustnotmatch" id="mustnotmatch" value="">
<input type="hidden" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" value="off">
@ -85,7 +85,7 @@
</tr></table>
</dd>
<dt><label>Dynamic URLs</label></dt>
<dd><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow '?' in path
<dd><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
</dd>
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on">
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh">

@ -72,6 +72,7 @@ h3 {
h4 {
font-size:1.1em;
margin-bottom: 4px;
}
a:link {

@ -5,7 +5,7 @@
<div class="SubMenugroup">
<h3>Crawler/Spider</h3>
<ul class="SubMenu">
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Site Crawl<br/>&nbsp;</a></li>
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Full Site Crawl/<br/>Sitemap Loader</a></li>
<li><a href="/CrawlStartExpert_p.html" class="MenuItemLink lock">Crawl Start<br/>(Expert)</a></li>
<li><a href="/Load_MediawikiWiki.html" class="MenuItemLink">Crawling of<br/>Media Wikis</a></li>
<li><a href="/Load_PHPBB3.html" class="MenuItemLink">Crawling of<br/>phpBB3 Forums</a></li>

@ -382,7 +382,9 @@ public final class RankingProcess extends Thread {
// in case we do not have a catch-all filter for urls
// we must also construct the domain navigator here
this.hostNavigator.inc(new String(urlhash, 6, 6), new String(urlhash));
if (query.sitehash == null) {
this.hostNavigator.inc(new String(urlhash, 6, 6), new String(urlhash));
}
}
// check for more errors

@ -43,6 +43,7 @@ import net.yacy.kelondro.util.EventTracker;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import de.anomic.search.MediaSnippet;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.graphics.ProfilingGraph;
@ -168,7 +169,7 @@ public class ResultFetcher {
if (page == null) break;
if (failedURLs.has(page.hash())) continue;
final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0
final ResultEntry resultEntry = fetchSnippet(page, query.host == null ? cacheStrategy : CacheStrategy.CACHEONLY); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used
//if (result.contains(resultEntry)) continue;

@ -20,11 +20,11 @@
package de.anomic.server;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.kelondro.logging.Log;
@ -35,7 +35,7 @@ public class serverAccessTracker {
private final long maxTrackingTime;
private final int maxTrackingCount;
private final int maxHostCount;
private final ConcurrentHashMap<String, List<Track>> accessTracker; // mappings from requesting host to an ArrayList of serverTrack-entries
private final ConcurrentHashMap<String, Collection<Track>> accessTracker; // mappings from requesting host to an ArrayList of serverTrack-entries
private long lastCleanup;
public static class Track {
@ -57,7 +57,7 @@ public class serverAccessTracker {
this.maxTrackingTime = maxTrackingTime;
this.maxTrackingCount = maxTrackingCount;
this.maxHostCount = maxTrackingHostCount;
this.accessTracker = new ConcurrentHashMap<String, List<Track>>();
this.accessTracker = new ConcurrentHashMap<String, Collection<Track>>();
}
/*
@ -68,8 +68,8 @@ public class serverAccessTracker {
if (System.currentTimeMillis() - this.lastCleanup < cleanupCycle) return;
// clear entries which had no entry for the maxTrackingTime time
final Iterator<Map.Entry<String, List<Track>>> i = accessTracker.entrySet().iterator();
List<Track> track;
final Iterator<Map.Entry<String, Collection<Track>>> i = accessTracker.entrySet().iterator();
Collection<Track> track;
while (i.hasNext()) {
track = i.next().getValue();
if (tailList(track, Long.valueOf(System.currentTimeMillis() - maxTrackingTime)).isEmpty()) {
@ -93,46 +93,40 @@ public class serverAccessTracker {
this.lastCleanup = System.currentTimeMillis();
}
public static List<Track> tailList(List<Track> timeList, long time) {
List<Track> t = new LinkedList<Track>();
public static Collection<Track> tailList(Collection<Track> timeList, long time) {
Collection<Track> t = new ConcurrentLinkedQueue<Track>();
for (Track l: timeList) if (l.getTime() > time) t.add(l);
return t;
}
private List<Track> clearTooOldAccess(final List<Track> access) {
private Collection<Track> clearTooOldAccess(final Collection<Track> access) {
try {
return tailList(access, Long.valueOf(System.currentTimeMillis() - maxTrackingTime));
} catch (IllegalArgumentException e) {
Log.logException(e);
return new LinkedList<Track>();
return new ConcurrentLinkedQueue<Track>();
}
}
public void track(final String host, String accessPath) {
// check storage size
if (System.currentTimeMillis() - this.lastCleanup > cleanupCycle) synchronized (this) {
if (System.currentTimeMillis() - this.lastCleanup > cleanupCycle) {
cleanupAccessTracker();
this.lastCleanup = System.currentTimeMillis();
}
if (System.currentTimeMillis() - this.lastCleanup > cleanupCycle) {
cleanupAccessTracker();
}
// learn that a specific host has accessed a specific path
if (accessPath == null) accessPath="NULL";
List<Track> track = accessTracker.get(host);
if (track == null) track = new LinkedList<Track>();
synchronized (track) {
track.add(new Track(System.currentTimeMillis(), accessPath));
// write back to tracker
accessTracker.put(host, clearTooOldAccess(track));
}
Collection<Track> track = accessTracker.get(host);
if (track == null) track = new ConcurrentLinkedQueue<Track>();
track.add(new Track(System.currentTimeMillis(), accessPath));
// write back to tracker
accessTracker.put(host, clearTooOldAccess(track));
}
public List<Track> accessTrack(final String host) {
public Collection<Track> accessTrack(final String host) {
// returns the tracked accesses (time and path) recorded for the given host, or null if the host is unknown
List<Track> access = accessTracker.get(host);
Collection<Track> access = accessTracker.get(host);
if (access == null) return null;
// clear too old entries
synchronized (access) {
@ -150,7 +144,7 @@ public class serverAccessTracker {
public Iterator<String> accessHosts() {
// returns an iterator of hosts in tracker (String)
final Map<String, List<Track>> accessTrackerClone = new ConcurrentHashMap<String, List<Track>>();
final Map<String, Collection<Track>> accessTrackerClone = new ConcurrentHashMap<String, Collection<Track>>();
accessTrackerClone.putAll(accessTracker);
return accessTrackerClone.keySet().iterator();
}

@ -25,8 +25,8 @@ import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@ -541,7 +541,7 @@ public class serverSwitch {
this.accessTracker.track(host, accessPath);
}
public List<Track> accessTrack(String host) {
public Collection<Track> accessTrack(String host) {
return this.accessTracker.accessTrack(host);
}

Loading…
Cancel
Save