diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java
index b6b982072..19081f27c 100644
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@@ -127,13 +127,8 @@ public class CrawlResults {
final String domain = post.get("domain", null);
final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
if (hashpart != null) {
- // delete all urls for this domain from database
- try {
- sb.index.fulltext().deleteDomain(hashpart);
- ResultURLs.deleteDomain(tabletype, domain, hashpart);
- } catch (final IOException e) {
- Log.logException(e);
- }
+ sb.index.fulltext().deleteDomain(hashpart, false);
+ ResultURLs.deleteDomain(tabletype, domain, hashpart);
}
}
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 59a7dff61..5a703e168 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -153,10 +153,11 @@
          <td><label for="mustmatch">Must-Match Filter</label>:</td>
- <input type="radio" name="range" value="wide" checked="checked" />Use filter
+ <input type="radio" name="range" value="wide" checked="checked" onclick="document.getElementById('deleteold').disabled=true;" />Use filter
- <input type="radio" name="range" value="domain" />Restrict to start domain<br />
- <input type="radio" name="range" value="subpath" />Restrict to sub-path
+ <input type="radio" name="range" value="domain" onclick="document.getElementById('deleteold').disabled=false;" />Restrict to start domain<br />
+ <input type="radio" name="range" value="subpath" onclick="document.getElementById('deleteold').disabled=false;" />Restrict to sub-path<br />
+ <input type="checkbox" name="deleteold" id="deleteold" disabled="disabled" />Delete all old documents in domain/subpath
The filter is a regular expression
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index 15978b66b..76dc82f57 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -81,6 +81,7 @@
<input type="radio" name="range" value="domain" checked="checked" />load all files in domain<br />
<input type="radio" name="range" value="subpath" />load only files in a sub-path of given url<br />
+ <input type="checkbox" name="deleteold" id="deleteold" />delete all old documents in domain/subpath
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index c54b08494..454eafd72 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -150,7 +150,8 @@ public class Crawler_p {
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
-
+ final boolean deleteold = (fullDomain || subPath) && post.getBoolean("deleteold");
+
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
Set<DigestURI> rootURLs = new HashSet<DigestURI>();
@@ -301,8 +302,18 @@ public class Crawler_p {
String siteFilter = ".*";
if (fullDomain) {
siteFilter = CrawlProfile.siteFilter(rootURLs);
+ if (deleteold) {
+ for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true);
+ }
} else if (subPath) {
siteFilter = CrawlProfile.subpathFilter(rootURLs);
+ if (deleteold) {
+ for (DigestURI u: rootURLs) {
+ String subpath = CrawlProfile.mustMatchSubpath(u);
+ if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
+ sb.index.fulltext().remove(subpath, true);
+ }
+ }
}
if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
newcrawlingMustMatch = siteFilter;
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index 1384d81c8..19b537348 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -297,12 +297,7 @@ public class IndexControlURLs_p {
if (post.containsKey("deletedomain")) {
final String hp = post.get("hashpart");
- try {
- segment.fulltext().deleteDomain(hp);
- } catch (final IOException e) {
- // TODO Auto-generated catch block
- Log.logException(e);
- }
+ segment.fulltext().deleteDomain(hp, false);
// trigger the loading of the table
post.put("statistics", "");
}
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 54699513a..17f1b2162 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -482,6 +482,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
+ public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
+ final StringBuilder filter = new StringBuilder();
+ for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
+ return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+ }
+
public static String mustMatchFilterFullDomain(final MultiProtocolURI uri) {
String host = uri.getHost();
if (host.startsWith("www.")) host = host.substring(4);
@@ -490,24 +496,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
}
- private static String mustMatchSubpath(final MultiProtocolURI uri) {
- String u = uri.toNormalform(true);
- if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
- return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
- }
-
- public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
- final StringBuilder filter = new StringBuilder();
- for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
- return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
- }
-
public static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
final StringBuilder filter = new StringBuilder();
for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchSubpath(uri));
return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
}
+ public static String mustMatchSubpath(final MultiProtocolURI uri) {
+ String u = uri.toNormalform(true);
+ if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
+ return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
+ }
+
public static final Set<String> ignoreNames = new HashSet<String>();
static {
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 1042e432a..912250f97 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -795,44 +795,53 @@ public final class Fulltext implements Iterable<byte[]> {
- * @return number of deleted domains
- * @throws IOException
+ * @param concurrent if true, the deletion runs in a background thread and the method returns immediately
*/
- public int deleteDomain(final String hosthash) throws IOException {
+ public void deleteDomain(final String hosthash, final boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
- // delete in solr
- synchronized (this.solr) {
- this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
- }
-
- // delete in old metadata structure
- final ArrayList<String> l = new ArrayList<String>();
- synchronized (this) {
- final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
- String hash;
- while (i != null && i.hasNext()) {
- hash = ASCII.String(i.next());
- if (hosthash.equals(hash.substring(6))) l.add(hash);
- }
- }
-
- // then delete the urls using this list
- int cnt = 0;
- for (final String h: l) {
- if (this.urlIndexFile.delete(ASCII.getBytes(h))) cnt++;
- }
-
- // finally remove the line with statistics
- if (this.statsDump != null) {
- final Iterator<HostStat> hsi = this.statsDump.iterator();
- HostStat hs;
- while (hsi.hasNext()) {
- hs = hsi.next();
- if (hs.hosthash.equals(hosthash)) {
- hsi.remove();
- break;
+
+ Thread t = new Thread() {
+ public void run() {
+ // delete in solr
+ synchronized (Fulltext.this.solr) {
+ try {
+ Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
+ Fulltext.this.solr.commit();
+ } catch (final IOException e) { Log.logException(e); }
+ }
+
+ // delete in old metadata structure
+ if (Fulltext.this.urlIndexFile != null) {
+ final ArrayList<String> l = new ArrayList<String>();
+ synchronized (Fulltext.this) {
+ CloneableIterator<byte[]> i;
+ try {
+ i = Fulltext.this.urlIndexFile.keys(true, null);
+ String hash;
+ while (i != null && i.hasNext()) {
+ hash = ASCII.String(i.next());
+ if (hosthash.equals(hash.substring(6))) l.add(hash);
+ }
+
+ // then delete the urls using this list
+ for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
+ } catch (final IOException e) { Log.logException(e); }
+ }
+ }
+
+ // finally remove the line with statistics
+ if (Fulltext.this.statsDump != null) {
+ final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+ HostStat hs;
+ while (hsi.hasNext()) {
+ hs = hsi.next();
+ if (hs.hosthash.equals(hosthash)) {
+ hsi.remove();
+ break;
+ }
+ }
}
}
- }
-
- return cnt;
+ };
+ if (concurrent) t.start(); else t.run();
}
}