- fixed 'delete from subpath' during crawl start, which previously deleted nothing;
 
it now works;
- changed some crawl start HTML design details
pull/1/head
Michael Peter Christen 12 years ago
parent 899fd8b62d
commit fb0fa9a102

@ -51,6 +51,10 @@
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td>
</tr>
<tr>
@ -79,13 +83,6 @@
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
</tr>
<tr>
<td colspan="3" class="commit">
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td>
</tr>
</table>
</td>
<td colspan="3">
@ -150,8 +147,8 @@
<dl>
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
<dd>Do not delete any document before the crawl is started.</dd>
<dt>Delete start host<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start url list, delete all documents from that host.</dd>
<dt>Delete sub-path<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
<dd>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">

@ -294,7 +294,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 0);
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}
@ -302,9 +302,9 @@ public class Crawler_p {
siteFilter = CrawlProfile.subpathFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
String subpath = CrawlProfile.mustMatchSubpath(u);
if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
int count = sb.index.fulltext().remove(subpath, deleteageDate, rootURLs.size() > 0);
String basepath = u.toNormalform(true);
if (!basepath.endsWith("/")) {int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
int count = sb.index.fulltext().remove(basepath, deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}

@ -345,7 +345,7 @@ public final class Fulltext implements Iterable<byte[]> {
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
Fulltext.this.solr.commit();
if (count.get() > 0) Fulltext.this.solr.commit();
} catch (IOException e) {}
}
@ -392,13 +392,11 @@ public final class Fulltext implements Iterable<byte[]> {
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @param concurrently if true, then the method returns immediately and runs concurrently
*/
public int remove(String subpath, Date freshdate, final boolean concurrently) {
int p = subpath.substring(0, subpath.length() - 1).lastIndexOf('/');
final String path = p > 8 ? subpath.substring(0, p + 1) : subpath;
public int remove(final String basepath, Date freshdate, final boolean concurrently) {
DigestURI uri;
try {uri = new DigestURI(path);} catch (MalformedURLException e) {return 0;}
try {uri = new DigestURI(basepath);} catch (MalformedURLException e) {return 0;}
final String host = uri.getHost();
final String q = YaCySchema.host_s.getSolrFieldName() + ":" + host +
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread(){
@ -408,7 +406,7 @@ public final class Fulltext implements Iterable<byte[]> {
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
if (u.startsWith(path)) {
if (u.startsWith(basepath)) {
remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
count.incrementAndGet();
}

Loading…
Cancel
Save