- fixed 'delete from subpath' during crawl start, which deleted nothing before; it now works
- changed some design details of the crawl start HTML page
pull/1/head
Michael Peter Christen 12 years ago
parent 899fd8b62d
commit fb0fa9a102

@ -51,6 +51,10 @@
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td> <td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td> <td>
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea> <textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td> </td>
</tr> </tr>
<tr> <tr>
@ -79,13 +83,6 @@
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td> <td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td> <td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
</tr> </tr>
<tr>
<td colspan="3" class="commit">
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td>
</tr>
</table> </table>
</td> </td>
<td colspan="3"> <td colspan="3">
@ -150,8 +147,8 @@
<dl> <dl>
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt> <dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
<dd>Do not delete any document before the crawl is started.</dd> <dd>Do not delete any document before the crawl is started.</dd>
<dt>Delete start host<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt> <dt>Delete sub-path<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start url list, delete all documents from that host.</dd> <dd>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt> <dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
<dd>Treat documents that are loaded <dd>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber"> <select name="deleteIfOlderNumber" id="deleteIfOlderNumber">

@ -294,7 +294,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs); siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) { if (deleteold) {
for (DigestURI u: rootURLs) { for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 0); int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
} }
} }
@ -302,9 +302,9 @@ public class Crawler_p {
siteFilter = CrawlProfile.subpathFilter(rootURLs); siteFilter = CrawlProfile.subpathFilter(rootURLs);
if (deleteold) { if (deleteold) {
for (DigestURI u: rootURLs) { for (DigestURI u: rootURLs) {
String subpath = CrawlProfile.mustMatchSubpath(u); String basepath = u.toNormalform(true);
if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2); if (!basepath.endsWith("/")) {int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
int count = sb.index.fulltext().remove(subpath, deleteageDate, rootURLs.size() > 0); int count = sb.index.fulltext().remove(basepath, deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
} }
} }

@ -345,7 +345,7 @@ public final class Fulltext implements Iterable<byte[]> {
synchronized (Fulltext.this.solr) { synchronized (Fulltext.this.solr) {
try { try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q)); count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
Fulltext.this.solr.commit(); if (count.get() > 0) Fulltext.this.solr.commit();
} catch (IOException e) {} } catch (IOException e) {}
} }
@ -392,13 +392,11 @@ public final class Fulltext implements Iterable<byte[]> {
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @param concurrently if true, then the method returns immediately and runs concurrently * @param concurrently if true, then the method returns immediately and runs concurrently
*/ */
public int remove(String subpath, Date freshdate, final boolean concurrently) { public int remove(final String basepath, Date freshdate, final boolean concurrently) {
int p = subpath.substring(0, subpath.length() - 1).lastIndexOf('/');
final String path = p > 8 ? subpath.substring(0, p + 1) : subpath;
DigestURI uri; DigestURI uri;
try {uri = new DigestURI(path);} catch (MalformedURLException e) {return 0;} try {uri = new DigestURI(basepath);} catch (MalformedURLException e) {return 0;}
final String host = uri.getHost(); final String host = uri.getHost();
final String q = YaCySchema.host_s.getSolrFieldName() + ":" + host + final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread(){ Thread t = new Thread(){
@ -408,7 +406,7 @@ public final class Fulltext implements Iterable<byte[]> {
SolrDocument doc; SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
if (u.startsWith(path)) { if (u.startsWith(basepath)) {
remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()))); remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
count.incrementAndGet(); count.incrementAndGet();
} }

Loading…
Cancel
Save