Merge branch 'master' of https://git.gitorious.org/yacy/rc1.git

14 years ago · 0aa5e134ea
parent 3b70ff7046 eb1c7c041d
commit 0aa5e134ea
20 changed files with 202 additions and 183 deletions
--- a/htroot/Bookmarks.html
+++ b/htroot/Bookmarks.html
@ -229,7 +229,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
 					/
 					<a href="Bookmarks.html?delete=#[hash]#" class="bookmarkAction" onclick="return confirm('Confirm deletion')">Delete</a>
 					/
-					<a href="/api/util/getpageinfo_p.xml?url=#[link]#" class="bookmarkAction">Info</a>
+					<a href="/api/getpageinfo_p.xml?url=#[link]#" class="bookmarkAction">Info</a>
 				</p>
 			</div>
 			#{/bookmarks}#
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@ -17,6 +17,9 @@
    </style>
  </head>
  <body id="IndexCreate">
+  
+<div id="api"></div>
+  
    #%env/templates/header.template%#
    #%env/templates/submenuIndexCreate.template%#
    <h2>Expert Crawl Start</h2>
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@ -17,6 +17,9 @@
    </style>
  </head>
  <body id="IndexCreate">
+  
+<div id="api"></div>
+  
    #%env/templates/header.template%#
    #%env/templates/submenuIndexCreate.template%#
    <h2>Site Crawling</h2>
--- a/htroot/ViewFile.html
+++ b/htroot/ViewFile.html
@ -7,6 +7,14 @@
  <script type="text/javascript" src="/js/highslide/highslide.js"></script>
  </head>
  <body>
+  
+<div id="api">
+<a href="http://localhost:8090/api/getpageinfo_p.xml?actions=title,robots&url=#[url]#" id="apilink">
+<img src="/env/grafics/api.png" width="60" height="40" alt="API"/>
+</a>
+<span>See the page info about the url.</span>
+</div>
+  
    #(display)#
    #%env/templates/simpleheader.template%#
    ::
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@ -8,6 +8,7 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import de.anomic.crawler.RobotsTxtEntry;
 import de.anomic.server.serverObjects;
@ -24,21 +25,23 @@ public class getpageinfo_p {
        prop.put("desc", "");
        prop.put("lang", "");
        prop.put("robots-allowed", "3"); //unknown
+        prop.put("robotsInfo", ""); //unknown
        prop.put("sitemap", "");
        prop.put("favicon","");
        prop.put("sitelist", "");
        prop.put("filter", ".*");

        // default actions
-        String actions="title,robots";
+        String actions = "title,robots";

        if (post != null && post.containsKey("url")) {
-            if(post.containsKey("actions"))
+            if (post.containsKey("actions"))
                actions=post.get("actions");
            String url=post.get("url");
-			if(url.toLowerCase().startsWith("ftp://")){
+			if (url.toLowerCase().startsWith("ftp://")) {
 				prop.put("robots-allowed", "1");
-				prop.putXML("title", "FTP: "+url);
+		        prop.put("robotsInfo", "ftp does not follow robots.txt");
+				prop.putXML("title", "FTP: " + url);
                return prop;
 			} else if (!url.startsWith("http://") &&
 		               !url.startsWith("https://") &&
@ -47,18 +50,18 @@ public class getpageinfo_p {
 		              !url.startsWith("file://")) {
                url = "http://" + url;
            }
-            if (actions.indexOf("title")>=0) {
+            if (actions.indexOf("title") >= 0) {
                DigestURI u = null;
                try {
                    u = new DigestURI(url);
                } catch (final MalformedURLException e) {
-                    // fail, do nothing
+                    Log.logException(e);
                }
                ContentScraper scraper = null;
                if (u != null) try {
                    scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
                } catch (final IOException e) {
-                    // now thats a fail, do nothing
+                    Log.logException(e);
                }
                if (scraper != null) {
                    // put the document title
@ -68,9 +71,9 @@ public class getpageinfo_p {
                    prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());

                    // put keywords
-                    final String list[]=scraper.getKeywords();
+                    final String list[] = scraper.getKeywords();
                    int count = 0;
-                    for (final String element : list) {
+                    for (final String element: list) {
                        final String tag = element;
                        if (!tag.equals("")) {
                            prop.putXML("tags_"+count+"_tag", tag);
@ -100,7 +103,7 @@ public class getpageinfo_p {
                    prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
                }
            }
-            if (actions.indexOf("robots")>=0) {
+            if (actions.indexOf("robots") >= 0) {
                try {
                    final DigestURI theURL = new DigestURI(url);

@ -110,13 +113,17 @@ public class getpageinfo_p {
                        robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
                    } catch (final IOException e) {
                        robotsEntry = null;
+                        Log.logException(e);
                    }
                	prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
+                    prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo()); // robotsEntry is null when getEntry threw an IOException; avoid the NPE

                    // get the sitemap URL of the domain
                    final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
                    prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
-                } catch (final MalformedURLException e) {}
+                } catch (final MalformedURLException e) {
+                    Log.logException(e);
+                }
            }

        }
--- a/htroot/api/util/getpageinfo_p.xml
+++ b/htroot/api/util/getpageinfo_p.xml
@ -4,6 +4,7 @@
  <desc>#[desc]#</desc>
  <lang>#[lang]#</lang>
  <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
+  <robotsInfo>#[robotsInfo]#</robotsInfo>
  <sitemap>#[sitemap]#</sitemap>
  <favicon>#[favicon]#</favicon>
  <sitelist>#[sitelist]#</sitelist>
--- a/htroot/api/ymarks/get_metadata.java
+++ b/htroot/api/ymarks/get_metadata.java
@ -1,6 +1,8 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.EnumMap;
+import java.util.Iterator;
+
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
@ -8,6 +10,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.search.Switchboard;
 import de.anomic.data.UserDB;
 import de.anomic.data.ymark.YMarkAutoTagger;
+import de.anomic.data.ymark.YMarkCrawlStart;
 import de.anomic.data.ymark.YMarkEntry;
 import de.anomic.data.ymark.YMarkMetadata;
 import de.anomic.data.ymark.YMarkTables;
@ -27,18 +30,43 @@ public class get_metadata {
        final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT);        
        
        if(isAdmin || isAuthUser) {
+
        	final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);        	
-			try {				
-				final String url = post.get(YMarkEntry.BOOKMARK.URL.key());
+			
+
+        	String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
+        	boolean hasProtocol = false;
+			for (YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {
+				if(url.toLowerCase().startsWith(p.protocol())) {
+					hasProtocol = true;
+					break;
+				}
+			}
+			if (!hasProtocol) {
+			    url=YMarkTables.PROTOCOLS.HTTP.protocol(url);
+			}
+    	
+        	try {				
 				YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.indexSegments);
 				final Document document = meta.loadDocument(sb.loader);
 				final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();
 				
 				prop.putXML("title", metadata.get(YMarkMetadata.METADATA.TITLE));
-				prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION));
-				
+				prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION));				
 				prop.put("keywords", putTags(document.dc_subject(','), "keywords"));
 				prop.put("autotags", putTags(YMarkAutoTagger.autoTag(document, 5, sb.tables.bookmarks.getTags(bmk_user)), "autotags"));
+    			
+				final YMarkCrawlStart crawlStart = new YMarkCrawlStart(sb.tables, url);
+    			final Iterator<String> iter = crawlStart.keySet().iterator();
+    			int count = 0;
+    			String key;
+    			while(iter.hasNext()) {
+    				key = iter.next();
+    				prop.putXML("crawlstart_"+count+"_key",key.toLowerCase());
+    				prop.putXML("crawlstart_"+count+"_value",crawlStart.get(key));
+    				count++;
+    			}
+    			prop.put("crawlstart", count);

 			} catch (MalformedURLException e1) {
 				// TODO Auto-generated catch block
--- a/htroot/api/ymarks/get_metadata.xml
+++ b/htroot/api/ymarks/get_metadata.xml
@ -10,4 +10,9 @@
 		#{autotags}#<tag name="#[tag]#" />
 		#{/autotags}#
 	</autotags>
+	<crawlstart 
+		#{crawlstart}##[key]#="#[value]#"
+		#{/crawlstart}#
+	>
+	</crawlstart>
 </info>
--- a/htroot/api/ymarks/get_treeview.java
+++ b/htroot/api/ymarks/get_treeview.java
@ -1,6 +1,5 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
-import java.util.ArrayList;
 import java.util.Date;
 import java.util.EnumMap;
 import java.util.Iterator;
@ -9,18 +8,20 @@ import java.util.TreeMap;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
 import net.yacy.kelondro.blob.Tables;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import de.anomic.data.UserDB;
+import de.anomic.data.ymark.YMarkAutoTagger;
 import de.anomic.data.ymark.YMarkCrawlStart;
 import de.anomic.data.ymark.YMarkEntry;
 import de.anomic.data.ymark.YMarkMetadata;
 import de.anomic.data.ymark.YMarkTables;
+import de.anomic.data.ymark.YMarkTag;
 import de.anomic.data.ymark.YMarkUtil;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@ -50,7 +51,7 @@ public class get_treeview {
        	boolean isMetadata = false;
        	boolean isURLdb = false;
        	boolean isCrawlStart = false;
-        	boolean isWordCount = false;
+        	boolean isAutoTagger = false;
        	boolean displayBmk = false;

        	if (post != null){
@ -73,7 +74,7 @@ public class get_treeview {
            			isURLdb = true;
            			isFolder = false;
            		} else if (post.get(ROOT).startsWith("w:")) {
-            			isWordCount = true;
+            			isAutoTagger = true;
            			isFolder = false;
            		} else if (post.get(ROOT).startsWith("c:")) {
            			isCrawlStart = true;
@ -192,7 +193,7 @@ public class get_treeview {
 			            prop.put("folders_"+count+"_hash", "c:"+url);
 			    		prop.put("folders_"+count+"_hasChildren", "true");
 			            count++;
-			            prop.put("folders_"+count+"_foldername","<small><b>WordCounts</b></small>");
+			            prop.put("folders_"+count+"_foldername","<small><b>AutoTagger</b></small>");
 			            putProp(count, "meta");
 			            prop.put("folders_"+count+"_hash", "w:"+url);
 			    		prop.put("folders_"+count+"_hasChildren", "true");
@ -205,23 +206,21 @@ public class get_treeview {
 				} catch (RowSpaceExceededException e) {
 					Log.logException(e);
 				}
-	        } else if (isWordCount || isMetadata || isURLdb || isCrawlStart) {
+	        } else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) {
 	        	try {
 	                final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.indexSegments);
-        			meta.loadDocument(sb.loader);
-	        		if(isWordCount)  {
-        				final TreeMap<String,Word> words = meta.getWordCounts();
-    					final ArrayList<String> topwords = new ArrayList<String>(words.descendingKeySet());
-    					for(int i = 0; i < 20 && i < topwords.size(); i++) {
-    						String word = topwords.get(i);
-    						int occur = words.get(word).occurrences();
-    						prop.put("folders_"+count+"_foldername","<small><b>"+word+":</b> [" + occur + "]</small>");
-        					putProp(count, "meta");
-        					count++;
-    					}
-    					count--;
-    					prop.put("folders_"+count+"_comma", "");
+        			final Document document = meta.loadDocument(sb.loader);
+        			final TreeMap<String, YMarkTag> tags = sb.tables.bookmarks.getTags(bmk_user);
+        			if(isAutoTagger)  {
+        				prop.put("folders_"+count+"_foldername","<small><b>meta-"+YMarkMetadata.METADATA.KEYWORDS.name().toLowerCase()+":</b> " + meta.loadMetadata().get(YMarkMetadata.METADATA.KEYWORDS) + "</small>");
+        				putProp(count, "meta");
+        				count++;
+						prop.put("folders_"+count+"_foldername","<small><b>with preference: </b>"+YMarkAutoTagger.autoTag(document, 4, tags)+"</small>");
+    					putProp(count, "meta");
    					count++;
+						prop.put("folders_"+count+"_foldername","<small><b>without preference: </b>"+YMarkAutoTagger.autoTag(document, 4, new  TreeMap<String, YMarkTag>())+"</small>");
+    					putProp(count, "meta");
+    					count++;        			
    	        		prop.put("folders", count);
 	        		} else if(isMetadata) {
 	        			count = putMeta(count, meta.loadMetadata());
--- a/htroot/js/Bookmarks.js
+++ b/htroot/js/Bookmarks.js
@ -24,7 +24,7 @@ function loadTitle(){
 	
 	url=document.getElementsByName("url")[0].value;
 	if(document.getElementsByName("title")[0].value==""){
-		sndReq('/api/util/getpageinfo_p.xml?actions=title&url='+url);
+		sndReq('/api/getpageinfo_p.xml?actions=title&url='+url);
 	}
 }

--- a/htroot/js/IndexCreate.js
+++ b/htroot/js/IndexCreate.js
@ -75,5 +75,7 @@ function loadInfos() {
 	
 	url=document.getElementById("crawlingURL").value;
 	if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").disabled=true; else document.getElementById("crawlingQ").disabled=false;
-	sndReq('/api/util/getpageinfo_p.xml?actions=title,robots&url='+url);
+	sndReq('/api/getpageinfo_p.xml?actions=title,robots&url='+url);
+	document.getElementById("api").innerHTML = '<a href="http://localhost:8090/api/getpageinfo_p.xml?actions=title,robots&amp;url=' + encodeURIComponent(url) + '" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a><span>See the page info about the start url.</span>'; // url is user input: percent-encode it so it cannot break out of the attribute or truncate the query string
+	
 }
--- a/htroot/yacy/ui/yacyui-bookmarks.html
+++ b/htroot/yacy/ui/yacyui-bookmarks.html
@ -85,7 +85,7 @@
 				var url = $("input[name='bm_url']").getValue();
 				$.ajax({
 					type: "GET",
-					url: "/api/util/getpageinfo_p.xml?url="+url,			
+					url: "/api/getpageinfo_p.xml?url="+url,			
 					dataType: "xml",
 					success: function(xml) {
 						var title = $(xml).find('title').text();
--- a/htroot/yacy/ui/yacyui-search.html
+++ b/htroot/yacy/ui/yacyui-search.html
@ -162,7 +162,7 @@
 	function getTags(url, i) {
 		$.ajax({
 			type: "GET",
-			url: "/api/util/getpageinfo_p.xml?url="+url,			
+			url: "/api/getpageinfo_p.xml?url="+url,			
 			dataType: "xml",
 			success: function(xml) {					
 				tags = "";
--- a/source/de/anomic/crawler/RobotsTxtEntry.java
+++ b/source/de/anomic/crawler/RobotsTxtEntry.java
@ -1,4 +1,4 @@
-//RobotsEntry.java 
+//RobotsEntry.java
 //-------------------------------------
 //part of YACY
 //(C) by Michael Peter Christen; mc@yacy.net
@ -43,7 +43,7 @@ import net.yacy.kelondro.util.ByteArray;


 public class RobotsTxtEntry {
-    
+
    private static final String HOST_NAME          = "hostname";
    private static final String ALLOW_PATH_LIST    = "allow";
    private static final String DISALLOW_PATH_LIST = "disallow";
@ -54,16 +54,18 @@ public class RobotsTxtEntry {
    private static final String CRAWL_DELAY        = "crawlDelay";
    private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
    private static final String AGENT_NAME         = "agentname";
-    
+
    // this is a simple record structure that holds all properties of a single crawl start
    private final Map<String, byte[]> mem;
    private final List<String> allowPathList, denyPathList;
    private final String hostName, agentName;
-    
+    private String info; // this is filled if robots disallowed access; then the reason is noted there;
+
    protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
        this.hostName = hostName.toLowerCase();
-        this.mem = mem; 
-        
+        this.mem = mem;
+        this.info = "";
+
        if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
            this.denyPathList = new LinkedList<String>();
            final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST));
@ -89,12 +91,12 @@ public class RobotsTxtEntry {
            this.allowPathList = new LinkedList<String>();
        }
        this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
-    }  
-    
+    }
+
    protected RobotsTxtEntry(
-            final MultiProtocolURI theURL, 
-            final List<String> allowPathList, 
-            final List<String> disallowPathList, 
+            final MultiProtocolURI theURL,
+            final List<String> allowPathList,
+            final List<String> disallowPathList,
            final Date loadedDate,
            final Date modDate,
            final String eTag,
@ -103,12 +105,12 @@ public class RobotsTxtEntry {
            final String agentName
    ) {
        if (theURL == null) throw new IllegalArgumentException("The url is missing");
-        
+
        this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
        this.allowPathList = new LinkedList<String>();
        this.denyPathList = new LinkedList<String>();
        this.agentName = agentName;
-        
+
        this.mem = new LinkedHashMap<String, byte[]>(10);
        this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
        if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime())));
@ -117,92 +119,92 @@ public class RobotsTxtEntry {
        if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap));
        if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis)));
        if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName));
-        
+
        if (allowPathList != null && !allowPathList.isEmpty()) {
            this.allowPathList.addAll(allowPathList);
-            
+
            final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
-            for (String element : allowPathList) {
+            for (final String element : allowPathList) {
                pathListStr.append(element)
                           .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
            }
            this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1)));
        }
-        
+
        if (disallowPathList != null && !disallowPathList.isEmpty()) {
            this.denyPathList.addAll(disallowPathList);
-            
+
            final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
-            for (String element : disallowPathList) {
+            for (final String element : disallowPathList) {
                pathListStr.append(element)
                           .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
            }
            this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1)));
        }
    }
-    
+
    protected String getHostName() {
        return this.hostName;
    }
-    
+
    protected String getAgentName() {
        return this.agentName;
    }
-    
+
    protected Map<String, byte[]> getMem() {
        if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
        return this.mem;
    }
-    
+
    @Override
    public String toString() {
        final StringBuilder str = new StringBuilder(6000);
        str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
        if (this.mem != null) str.append(this.mem.toString());
        return str.toString();
-    }    
-    
+    }
+
    /**
     * get the sitemap url
     * @return the sitemap url or null if no sitemap url is given
     */
    public MultiProtocolURI getSitemap() {
-        String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
+        final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
        if (url == null) return null;
        try {
            return new MultiProtocolURI(url);
-        } catch (MalformedURLException e) {
+        } catch (final MalformedURLException e) {
            return null;
        }
    }
-    
+
    protected Date getLoadedDate() {
        if (this.mem.containsKey(LOADED_DATE)) {
            return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
        }
        return null;
    }
-    
+
    protected void setLoadedDate(final Date newLoadedDate) {
        if (newLoadedDate != null) {
            this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
        }
    }
-    
+
    protected Date getModDate() {
        if (this.mem.containsKey(MOD_DATE)) {
            return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
        }
        return null;
-    }        
-    
+    }
+
    protected String getETag() {
        if (this.mem.containsKey(ETAG)) {
            return ASCII.String(this.mem.get(ETAG));
        }
        return null;
-    }          
-    
+    }
+
    protected long getCrawlDelayMillis() {
        if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
            return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
@ -214,26 +216,38 @@ public class RobotsTxtEntry {
        } catch (final NumberFormatException e) {
            return 0;
        }
-        return 0;           
+        return 0;
    }
-    
-    public boolean isDisallowed(MultiProtocolURI subpathURL) {
+
+    public boolean isDisallowed(final MultiProtocolURI subpathURL) {
        String path = subpathURL.getFile();
-        if ((this.mem == null) || (this.denyPathList.isEmpty())) return false;   
-        
+        if (this.mem == null) {
+            this.info = "no robots file available";
+            return false;
+        }
+        if (this.denyPathList.isEmpty()) {
+            this.info = "no entry in robots.txt";
+            return false;
+        }
+
        // if the path is null or empty we set it to /
-        if ((path == null) || (path.length() == 0)) path = "/";            
+        if (path == null || path.length() == 0) path = "/";
        // escaping all occurences of ; because this char is used as special char in the Robots DB
        else  path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
-        
-        for (String element : this.denyPathList) {
-                
+
+        for (final String element : this.denyPathList) {
+
            // disallow rule
            if (path.startsWith(element)) {
+                this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
                return true;
            }
        }
+        this.info = "path '" + path + "' does not start with any element from deny path list";
        return false;
    }

+    public String getInfo() {
+        return this.info;
+    }
 }
--- a/source/de/anomic/data/ymark/TablesRowComparator.java
+++ b/source/de/anomic/data/ymark/TablesRowComparator.java
@ -6,9 +6,14 @@ import net.yacy.kelondro.blob.Tables;

 public class TablesRowComparator implements Comparator<Tables.Row> {
    private String sortname;
+    private boolean desc;
    
-    public TablesRowComparator(final String sortname) {
+    public TablesRowComparator(final String sortname, final String sortorder) {
        setSortName(sortname);
+        // Constant-first comparison: a null sortorder selects ascending
+        // order instead of throwing a NullPointerException. Any value
+        // other than "desc" also means ascending.
+        this.desc = "desc".equals(sortorder);
    }
    
    public void setSortName(final String sortname) {
@ -20,7 +25,10 @@ public class TablesRowComparator implements Comparator<Tables.Row> {
            if(row0.containsKey(this.sortname) && row1.containsKey(this.sortname)) {
               String name1 = UTF8.String(row0.get(this.sortname)).toLowerCase();
               String name2 = UTF8.String(row1.get(this.sortname)).toLowerCase();
-               return name1.compareTo(name2);
+               if(desc)
+            	   return name2.compareTo(name1);
+               else
+            	   return name1.compareTo(name2);
            }
        }
        return 0;
--- a/source/de/anomic/data/ymark/YMarkAutoTagger.java
+++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java
@ -29,7 +29,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle

 	public final static String SPACE = " ";
 	public final static String POISON = "";
-	public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo"));
+	public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo", 
+			"and", "with", "the", "gt", "lt"));


 	private final ArrayBlockingQueue<String> bmkQueue;
@ -90,35 +91,40 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 			// generate potential tags from document title, description and subject
 			final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
 			final StringBuilder buffer = new StringBuilder(bufferSize);
+			final StringBuilder pwords = new StringBuilder(1000);
 			buffer.append(document.dc_title().toLowerCase());
 			buffer.append(document.dc_description().toLowerCase());
 			buffer.append(document.dc_subject(' ').toLowerCase());
 			final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
-			int count = 0;
+			int score = 0;
 			
 			// get phrases
 			final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
 			phrases.putAll(getPhrases(document, 3));
-			phrases.putAll(getPhrases(document, 4));
 			final Iterator<String> iter = phrases.keySet().iterator();
 			while(iter.hasNext()) {
-				count = 10;
+				score = 10;
 				final String phrase = iter.next();							
 				if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
-					count = phrases.get(phrase).size() * phrase.split(" ").length * 35;
+					score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
 				}
 				if(isDigitSpace(phrase)) {
-					count = 10;
+					score = 10;
 				}
 				if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {					
-					count = count * 10;
-				}	
-				topwords.add(new YMarkTag(phrase, count));
+					score = score * 10;
+				}
+				if (tags.containsKey(phrase)) {
+					score = score * 20;
+				}
+				topwords.add(new YMarkTag(phrase, score));
+				pwords.append(phrase);
+				pwords.append(' ');
 			}
-						
+			
 			// loop through potential tag and rank them
 			while(tokens.hasMoreElements()) {				
-				count = 0;
+				score = 0;
 				token = tokens.nextElement();
 				
 				// check if the token appears in the text
@ -126,23 +132,27 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 					final Word word = words.get(token.toString());
 					// token appears in text and matches an existing bookmark tag
 					if (tags.containsKey(token.toString())) {
-						count = word.occurrences() * tags.get(token.toString()).size() * 200;
+						score = word.occurrences() * tags.get(token.toString()).size() * 200;
 					}
 					// token appears in text and has more than 3 characters
 					else if (token.length()>3) {
-						count = word.occurrences() * 100;
+						score = word.occurrences() * 100;
+					}
+					// if the token occurs anywhere in the collected phrases (a match at offset 0 counts too), reduce score
+					if(pwords.indexOf(token.toString()) >= 0) {
+						score = score / 3;
 					}
-					topwords.add(new YMarkTag(token.toString(), count));
+					topwords.add(new YMarkTag(token.toString(), score));
 				}
 			}
-			count = 0;
+			score = 0;
 			buffer.setLength(0);
 			for(final YMarkTag tag : topwords) {
-				if(count < max) {
+				if(score < max) {
 					if(tag.size() > 100) {
 						buffer.append(tag.name());
 						buffer.append(YMarkUtil.TAGS_SEPARATOR);
-						count++;
+						score++;
 					}
 				} else {
 					break;
@ -165,7 +175,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 		while(tokens.hasMoreElements()) {				

 			token = tokens.nextElement();			
-			if(stopwords.contains(token.toString()))
+			if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
 				continue;			
 			
 			// if we have a full phrase, delete the first token
--- a/source/de/anomic/data/ymark/YMarkCrawlStart.java
+++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java
@ -52,11 +52,13 @@ public class YMarkCrawlStart extends HashMap<String,String>{
 	public void load(final String url) {
 		try {
 			final StringBuilder buffer = new StringBuilder(500);
-			buffer.append("^.*crawlingURL=\\Q");
+			//buffer.append("^.*crawlingURL=\\Q");
+			buffer.append("^crawl start for \\Q");
 			buffer.append(url);
 			buffer.append("\\E?.*");
 			final Pattern pattern = Pattern.compile(buffer.toString());
-			final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
+			//final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
+			final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_COMMENT, pattern);
 			Tables.Row row = null;
 			while(APIcalls.hasNext()) {
 				row = APIcalls.next();
--- a/source/de/anomic/data/ymark/YMarkMetadata.java
+++ b/source/de/anomic/data/ymark/YMarkMetadata.java
@ -29,19 +29,14 @@ package de.anomic.data.ymark;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.EnumMap;
-import java.util.Map;
-import java.util.TreeMap;

 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.Condenser;
 import net.yacy.document.Document;
-import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser.Failure;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.data.word.Word;
 import net.yacy.repository.LoaderDispatcher;
 import net.yacy.search.index.Segments;
 import de.anomic.crawler.retrieval.Response;
@ -141,18 +136,4 @@ public class YMarkMetadata {
 		}
 		return metadata;
 	}
-	
-	public TreeMap<String,Word> getWordCounts() {
-		if (this.document != null) {
-            return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words());
-        }
-		return new TreeMap<String, Word>();
-	}
-
-	public static TreeMap<String,Word> sortWordCounts(final Map<String, Word> unsorted_words) {
-        final TreeMap<String, Word> sorted_words = new TreeMap<String, Word>(new YMarkWordCountComparator(unsorted_words));
-        sorted_words.putAll(unsorted_words);
-        return sorted_words;
-    }
-
 }
--- a/source/de/anomic/data/ymark/YMarkTables.java
+++ b/source/de/anomic/data/ymark/YMarkTables.java
@ -27,9 +27,11 @@
 package de.anomic.data.ymark;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Iterator;
-import java.util.SortedSet;
+import java.util.List;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
@ -214,17 +216,16 @@ public class YMarkTables {
    	return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p);
    }

-    public SortedSet<Row> orderBookmarksBy(final Iterator<Row> rowIterator, final String sortname, final String sortorder) {
-        final TreeSet<Row> sortTree = new TreeSet<Tables.Row>(new TablesRowComparator(sortname));
+    public List<Row> orderBookmarksBy(final Iterator<Row> rowIterator, final String sortname, final String sortorder) {
+        final List<Row> sortList = new ArrayList<Row>();
        Row row;
        while (rowIterator.hasNext()) {
            row = rowIterator.next();
            if(row != null)
-                sortTree.add(row);
+                sortList.add(row);
        }
-        if(sortorder.equals("desc"))
-            return sortTree.descendingSet();
-        return sortTree;
+        Collections.sort(sortList, new TablesRowComparator(sortname, sortorder)); 
+        return sortList;
    }

    public void addTags(final String bmk_user, final String url, final String tagString, final boolean merge) throws IOException, RowSpaceExceededException {
--- a/source/de/anomic/data/ymark/YMarkWordCountComparator.java
+++ b/source/de/anomic/data/ymark/YMarkWordCountComparator.java
@ -1,53 +0,0 @@
-// YMarkWordCountComparator.java
-// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
-// first published 2010 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-// LICENSE
-// 
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-package de.anomic.data.ymark;
-
-import java.util.Comparator;
-import java.util.Map;
-
-import net.yacy.kelondro.data.word.Word;
-
-public class YMarkWordCountComparator implements Comparator<String> {
-
-	private Map<String,Word> words;
-	
-	public YMarkWordCountComparator(final Map<String,Word> words) {
-		this.words = words;
-	}
-	
-	public int compare(final String k1, final String k2) {
-		final Word w1 = this.words.get(k1);
-		final Word w2 = this.words.get(k2);
-		
-        if(w1.occurrences() > w2.occurrences())
-            return 1;
-        else if(w1.occurrences() < w2.occurrences())
-            return -1;
-        else
-            return 0; 
-	}
-}