Enhanced URL constraint computation: the constraint checks (exclusion, URL mask, index-of, content domain) now run earlier in the retrieval process, in RankingProcess, instead of in ResultFetcher.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6272 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · b0637600d5
parent 3ebb228ea1
commit b0637600d5
3 changed files with 145 additions and 158 deletions
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@ -53,6 +53,12 @@ public final class QueryParams {
    public static final int CONTENTDOM_VIDEO = 3;
    public static final int CONTENTDOM_APP   = 4;
    
+    public static enum FetchMode { // NOTE(review): presumably the typed counterpart of the int snippet modes 0/1/2 used by ResultFetcher — confirm
+    	NO_FETCH_NO_VERIFY, // presumably snippet mode 0 (no snippet fetch) — confirm
+    	FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE, // presumably snippet mode 1 (a missing snippet is tolerated) — confirm
+    	FETCH_AND_VERIFY_ONLINE; // presumably snippet mode 2 (online fetch, entry rejected without snippet) — confirm
+    }
+    
    public static final Bitfield empty_constraint    = new Bitfield(4, "AAAAAA");
    public static final Bitfield catchall_constraint = new Bitfield(4, "______");
    
--- a/source/de/anomic/search/RankingProcess.java
+++ b/source/de/anomic/search/RankingProcess.java
@ -328,47 +328,80 @@ public final class RankingProcess extends Thread {
            if (((stack.size() == 0) && (size() == 0))) break;
            final SortStack<WordReferenceVars>.stackElement obrwi = takeRWI(skipDoubleDom);
            if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
-            final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
-            if (u != null) {
-                final URLMetadataRow.Components metadata = u.metadata();
-
-                // TODO: check url constraints
-                
-                
-                // evaluate information of metadata for navigation
-                // author navigation:
-                String author = metadata.dc_creator();
-                if (author != null && author.length() > 0) {
-                	// add author to the author navigator
-                    String authorhash = new String(Word.word2hash(author));
-                    //System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
-                    
-                    // check if we already are filtering for authors
-                	if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
-                		continue;
-                	}
-                	
-                	// add author to the author navigator
-                    AuthorInfo in = this.authorNavigator.get(authorhash);
-                    if (in == null) {
-                        this.authorNavigator.put(authorhash, new AuthorInfo(author));
-                    } else {
-                        in.inc();
-                        this.authorNavigator.put(authorhash, in);
-                    }
-                } else if (this.query.authorhash != null) {
-                	continue;
-                }
+            final URLMetadataRow page = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
+            if (page == null) {
+            	misses.add(obrwi.element.metadataHash());
+            	continue;
+            }
+            
+            // prepare values for constraint check
+            final URLMetadataRow.Components metadata = page.metadata();
+            
+            // check url constraints
+            if (metadata.url() == null) {
+                continue; // rare case where the url is corrupted
+            }
+            
+            final String pageurl = metadata.url().toNormalform(true, true);
+            final String pageauthor = metadata.dc_creator();
+            final String pagetitle = metadata.dc_title().toLowerCase();
+            
+            // check exclusion
+            if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
+                (QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) ||
+                (QueryParams.matches(pageauthor.toLowerCase(), query.excludeHashes))) {
+                continue;
+            }
+            
+            // check url mask
+            if (!(pageurl.matches(query.urlMask))) {
+                continue;
+            }
+            
+            // check index-of constraint
+            if ((query.constraint != null) &&
+                (query.constraint.get(Condenser.flag_cat_indexof)) &&
+                (!(pagetitle.startsWith("index of")))) {
+                final Iterator<byte[]> wi = query.queryHashes.iterator();
+                while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
+                continue;
+            }
+            
+            // check content domain
+            if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO && page.laudio() == 0) ||
+                (query.contentdom == QueryParams.CONTENTDOM_VIDEO && page.lvideo() == 0) ||
+                (query.contentdom == QueryParams.CONTENTDOM_IMAGE && page.limage() == 0) ||
+                (query.contentdom == QueryParams.CONTENTDOM_APP && page.lapp() == 0)) {
+            	continue;
+            }
+            
+            // evaluate information of metadata for navigation
+            // author navigation:
+            if (pageauthor != null && pageauthor.length() > 0) {
+            	// add author to the author navigator
+                String authorhash = new String(Word.word2hash(pageauthor));
+                //System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
                
-                // get the url
-                if (metadata.url() != null) {
-                	String urlstring = metadata.url().toNormalform(true, true);
-                	if (urlstring == null || !urlstring.matches(query.urlMask)) continue;                    
-                    this.handover.add(u.hash()); // remember that we handed over this url
-                    return u;
+                // check if we already are filtering for authors
+            	if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
+            		continue;
+            	}
+            	
+            	// add author to the author navigator
+                AuthorInfo in = this.authorNavigator.get(authorhash);
+                if (in == null) {
+                    this.authorNavigator.put(authorhash, new AuthorInfo(pageauthor));
+                } else {
+                    in.inc();
+                    this.authorNavigator.put(authorhash, in);
                }
+            } else if (this.query.authorhash != null) {
+            	continue;
            }
-            misses.add(obrwi.element.metadataHash());
+            
+            // accept url
+            this.handover.add(page.hash()); // remember that we handed over this url
+            return page;
        }
        return null;
    }
--- a/source/de/anomic/search/ResultFetcher.java
+++ b/source/de/anomic/search/ResultFetcher.java
@ -29,7 +29,6 @@ package de.anomic.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.TreeSet;

 import de.anomic.document.Condenser;
@ -66,7 +65,7 @@ public class ResultFetcher {
    
    
    @SuppressWarnings("unchecked")
-    ResultFetcher(
+    public ResultFetcher(
            RankingProcess rankedCache,
            final QueryParams query,
            final Segment indexSegment,
@ -112,123 +111,7 @@ public class ResultFetcher {
        }
    }
    
-    protected ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
-
-        // a search result entry needs some work to produce a result Entry:
-        // - check if url entry exists in LURL-db
-        // - check exclusions, constraints, masks, media-domains
-        // - load snippet (see if page exists) and check if snippet contains searched word
-
-        // Snippet Fetching can has 3 modes:
-        // 0 - do not fetch snippets
-        // 1 - fetch snippets offline only
-        // 2 - online snippet fetch
-        
-        // load only urls if there was not yet a root url of that hash
-        // find the url entry
-
-        long startTime = System.currentTimeMillis();
-        final URLMetadataRow.Components metadata = page.metadata();
-        final String pagetitle = metadata.dc_title().toLowerCase();
-        if (metadata.url() == null) {
-            registerFailure(page.hash(), "url corrupted (null)");
-            return null; // rare case where the url is corrupted
-        }
-        final String pageurl = metadata.url().toString().toLowerCase();
-        final String pageauthor = metadata.dc_creator().toLowerCase();
-        final long dbRetrievalTime = System.currentTimeMillis() - startTime;
-        
-        // check exclusion
-        if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
-            (QueryParams.matches(pageurl, query.excludeHashes)) ||
-            (QueryParams.matches(pageauthor, query.excludeHashes))) {
-            return null;
-        }
-            
-        // check url mask
-        if (!(pageurl.matches(query.urlMask))) {
-            return null;
-        }
-            
-        // check constraints
-        if ((query.constraint != null) &&
-            (query.constraint.get(Condenser.flag_cat_indexof)) &&
-            (!(metadata.dc_title().startsWith("Index of")))) {
-            final Iterator<byte[]> wi = query.queryHashes.iterator();
-            while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
-            registerFailure(page.hash(), "index-of constraint not fullfilled");
-            return null;
-        }
-        
-        if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (page.laudio() == 0)) {
-            registerFailure(page.hash(), "contentdom-audio constraint not fullfilled");
-            return null;
-        }
-        if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (page.lvideo() == 0)) {
-            registerFailure(page.hash(), "contentdom-video constraint not fullfilled");
-            return null;
-        }
-        if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (page.limage() == 0)) {
-            registerFailure(page.hash(), "contentdom-image constraint not fullfilled");
-            return null;
-        }
-        if ((query.contentdom == QueryParams.CONTENTDOM_APP) && (page.lapp() == 0)) {
-            registerFailure(page.hash(), "contentdom-app constraint not fullfilled");
-            return null;
-        }
-
-        if (snippetFetchMode == 0) {
-            return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
-        }
-        
-        // load snippet
-        if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
-            // attach text snippet
-            startTime = System.currentTimeMillis();
-            final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
-            final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
-            
-            if (snippet.getErrorCode() < 11) {
-                // we loaded the file and found the snippet
-                return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
-            } else if (snippetFetchMode == 1) {
-                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
-                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
-                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
-            } else {
-                // problems with snippet fetch
-                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
-                if (!peers.mySeed().isVirgin())
-                    try {
-                        TextSnippet.failConsequences(snippet, query.id(false));
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                    }
-                return null;
-            }
-        } else {
-            // attach media information
-            startTime = System.currentTimeMillis();
-            final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
-            final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
-            
-            if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
-                // found media snippets, return entry
-                return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
-            } else if (snippetFetchMode == 1) {
-                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
-            } else {
-                // problems with snippet fetch
-                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
-                return null;
-            }
-        }
-        // finished, no more actions possible here
-    }
-    
-    boolean anyWorkerAlive() {
+   boolean anyWorkerAlive() {
        if (this.workerThreads == null) return false;
        for (int i = 0; i < this.workerThreads.length; i++) {
           if ((this.workerThreads[i] != null) &&
@ -281,7 +164,8 @@ public class ResultFetcher {
                    if (result.exists(page.hash().hashCode())) continue;
                    if (failedURLs.get(page.hash()) != null) continue;
                    
-                    final ResultEntry resultEntry = obtainResultEntry(page, snippetMode);
+                    final ResultEntry resultEntry = fetchSnippet(page, snippetMode);
+                    
                    if (resultEntry == null) continue; // the entry had some problems, cannot be used
                    urlRetrievalAllTime += resultEntry.dbRetrievalTime;
                    snippetComputationAllTime += resultEntry.snippetComputationTime;
@ -305,6 +189,70 @@ public class ResultFetcher {
        }
    }
    
+    protected ResultEntry fetchSnippet(final URLMetadataRow page, final int snippetMode) {
+        // Snippet fetching has 3 modes (snippetMode):
+        // 0 - do not fetch snippets; the entry is returned without any snippet
+        // 1 - fetch snippets offline only; a missing snippet does not reject the entry
+        // 2 - online snippet fetch; an entry without snippet is registered as failure and null is returned
+        
+        // read the metadata components of the page and measure the time this takes;
+        // that time is handed to every ResultEntry created below as dbRetrievalTime
+
+        long startTime = System.currentTimeMillis();
+        final URLMetadataRow.Components metadata = page.metadata();
+        final long dbRetrievalTime = System.currentTimeMillis() - startTime;
+        
+        if (snippetMode == 0) {
+            return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
+        }
+        
+        // load snippet: a text snippet for the text content domain, media snippets otherwise
+        if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
+            // attach text snippet
+            startTime = System.currentTimeMillis();
+            final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
+            final long snippetComputationTime = System.currentTimeMillis() - startTime;
+            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
+            
+            if (snippet.getErrorCode() < 11) {
+                // error codes below 11 count as success: we loaded the file and found the snippet
+                return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
+            } else if (snippetMode == 1) {
+                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
+                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
+                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
+            } else {
+                // problems with snippet fetch: remember the failure and reject this entry
+                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
+                if (!peers.mySeed().isVirgin())
+                    try {
+                        TextSnippet.failConsequences(snippet, query.id(false));
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                return null;
+            }
+        } else {
+            // attach media information
+            startTime = System.currentTimeMillis();
+            final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
+            final long snippetComputationTime = System.currentTimeMillis() - startTime;
+            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
+            
+            if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
+                // found media snippets, return entry
+                return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
+            } else if (snippetMode == 1) {
+                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
+            } else {
+                // problems with snippet fetch: remember the failure and reject this entry
+                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
+                return null;
+            }
+        }
+        // finished, no more actions possible here
+    }
+    
    private void registerFailure(final String urlhash, final String reason) {
        this.failedURLs.put(urlhash, reason);
        Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);