- fixed 'delete from subpath' during crawl start, which previously deleted nothing;
 
it now works;
- changed some crawl start HTML design details
pull/1/head
Michael Peter Christen 12 years ago
parent 899fd8b62d
commit fb0fa9a102

@ -51,6 +51,10 @@
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td>
</tr>
<tr>
@ -79,13 +83,6 @@
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
</tr>
<tr>
<td colspan="3" class="commit">
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td>
</tr>
</table>
</td>
<td colspan="3">
@ -150,8 +147,8 @@
<dl>
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
<dd>Do not delete any document before the crawl is started.</dd>
<dt>Delete start host<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start url list, delete all documents from that host.</dd>
<dt>Delete sub-path<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
<dd>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">

@ -294,7 +294,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 0);
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}
@ -302,9 +302,9 @@ public class Crawler_p {
siteFilter = CrawlProfile.subpathFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
String subpath = CrawlProfile.mustMatchSubpath(u);
if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
int count = sb.index.fulltext().remove(subpath, deleteageDate, rootURLs.size() > 0);
String basepath = u.toNormalform(true);
if (!basepath.endsWith("/")) {int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
int count = sb.index.fulltext().remove(basepath, deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}

@ -345,7 +345,7 @@ public final class Fulltext implements Iterable<byte[]> {
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
Fulltext.this.solr.commit();
if (count.get() > 0) Fulltext.this.solr.commit();
} catch (IOException e) {}
}
@ -392,13 +392,11 @@ public final class Fulltext implements Iterable<byte[]> {
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @param concurrently if true, then the method returns immediately and runs concurrently
*/
public int remove(String subpath, Date freshdate, final boolean concurrently) {
int p = subpath.substring(0, subpath.length() - 1).lastIndexOf('/');
final String path = p > 8 ? subpath.substring(0, p + 1) : subpath;
public int remove(final String basepath, Date freshdate, final boolean concurrently) {
DigestURI uri;
try {uri = new DigestURI(path);} catch (MalformedURLException e) {return 0;}
try {uri = new DigestURI(basepath);} catch (MalformedURLException e) {return 0;}
final String host = uri.getHost();
final String q = YaCySchema.host_s.getSolrFieldName() + ":" + host +
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread(){
@ -408,7 +406,7 @@ public final class Fulltext implements Iterable<byte[]> {
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
if (u.startsWith(path)) {
if (u.startsWith(basepath)) {
remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
count.incrementAndGet();
}

Loading…
Cancel
Save