small fixes to getpageinfo_p.xml and htmlFilterContentScraper.java with respect to keyword extraction

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5185 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 5e8bd0f29c
parent 029e16b653
commit 5e8bd0f29c
2 changed files with 13 additions and 5 deletions
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@@ -46,11 +46,18 @@ public class getpageinfo_p {
    public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
        final plasmaSwitchboard sb = (plasmaSwitchboard) env;
        final serverObjects prop = new serverObjects();
-        prop.put("sitemap", "");
-        prop.put("title", "");
-        prop.put("favicon","");
+        
+        // avoid UNRESOLVED PATTERN        
+        prop.put("title", "");        
+        prop.put("desc", "");
+        prop.put("lang", "");
        prop.put("robots-allowed", "3"); //unknown
-        String actions="title";
+        prop.put("sitemap", "");
+        prop.put("favicon","");        
+        
+        // default actions
+        String actions="title,robots";
+        
        if(post!=null && post.containsKey("url")){
            if(post.containsKey("actions"))
                actions=post.get("actions");
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -395,7 +395,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        if (s.length() == 0) {
            return getTitle().toLowerCase().split(splitrex);
        }
-        if (s.contains(",")) return s.split(",");
+        if (s.contains(",")) return s.split(" |,");
+        if (s.contains(";")) return s.split(" |;");
        return s.split("\\s");
    }