From 5b2a57bfd05801af984134aca27f38ec5e5df713 Mon Sep 17 00:00:00 2001
From: apfelmaennchen <apfelmaennchen@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Thu, 18 Sep 2008 21:01:23 +0000
Subject: [PATCH] - /xml/util/getpageinfo_p.xml added <desc> and <lang> tags -
 changed htmlFilterContentScraper.getKeywords() to split on either space or
 comma character, not both

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5183 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/xml/util/getpageinfo_p.java                   | 12 ++++++------
 htroot/xml/util/getpageinfo_p.xml                    |  4 +++-
 .../anomic/htmlFilter/htmlFilterContentScraper.java  |  3 ++-
 3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java
index c27ca9acc..663d61f22 100644
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@@ -88,16 +88,16 @@ public class getpageinfo_p {
                     int count = 0;
                     for(int i=0;i<list.length;i++){
                     	String tag = list[i];
-                    	if (!tag.equals("")) {
-                    		while (i<(list.length-1) && !list[i+1].equals("")) {
-                    			i++;
-                    			tag += " "+list[i];                    			
-                    		}                    	                 	
+                    	if (!tag.equals("")) {                   	                 	
                     		prop.putHTML("tags_"+count+"_tag", tag, true);
                     		count++;
                     	}
                     }
                     prop.put("tags", count);
+                    // put description                    
+                    prop.putHTML("desc", scraper.getDescription(), true);
+                    // put language 
+                    prop.putHTML("lang", scraper.getContentLanguages()[0], true);
 
                 } catch (final MalformedURLException e) { /* ignore this */
                 } catch (final IOException e) { /* ignore this */
@@ -106,7 +106,7 @@ public class getpageinfo_p {
             if(actions.indexOf("robots")>=0){
                 try {
                     final yacyURL theURL = new yacyURL(url, null);
-                	
+                    
                 	// determine if crawling of the current URL is allowed
                 	prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? "0" : "1");
                     
diff --git a/htroot/xml/util/getpageinfo_p.xml b/htroot/xml/util/getpageinfo_p.xml
index a89d94140..34d2cbb05 100644
--- a/htroot/xml/util/getpageinfo_p.xml
+++ b/htroot/xml/util/getpageinfo_p.xml
@@ -1,6 +1,8 @@
 <?xml version='1.0' standalone='yes'?>
 <pageinfo>
   <title>#[title]#</title>
+  <desc>#[desc]#</desc>
+  <lang>#[lang]#</lang>
   <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
   <sitemap>#[sitemap]#</sitemap>
   <favicon>#[favicon]#</favicon>
@@ -9,4 +11,4 @@
     <tag name="#[tag]#" />
     #{/tags}#
   </tags>
-</pageinfo>
\ No newline at end of file
+</pageinfo>
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 4feab2a3a..91e5d0964 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -395,7 +395,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         if (s.length() == 0) {
             return getTitle().toLowerCase().split(splitrex);
         }
-        return s.split(" |,");
+        if (s.contains(",")) return s.split(",");
+        return s.split("\\s");
     }
     
     public int getRefreshSeconds() {