- /xml/util/getpageinfo_p.xml: added <desc> and <lang> tags

- changed htmlFilterContentScraper.getKeywords() to split on either the space or the comma character, not both. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5183 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 5b2a57bfd0
parent e1f67262f7
commit 5b2a57bfd0
3 changed files with 11 additions and 8 deletions
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@ -88,16 +88,16 @@ public class getpageinfo_p {
                    int count = 0;
                    for(int i=0;i<list.length;i++){
                    	String tag = list[i];
-                    	if (!tag.equals("")) {
-                    		while (i<(list.length-1) && !list[i+1].equals("")) {
-                    			i++;
-                    			tag += " "+list[i];                    			
-                    		}                    	                 	
+                    	if (!tag.equals("")) {                   	                 	
                    		prop.putHTML("tags_"+count+"_tag", tag, true);
                    		count++;
                    	}
                    }
                    prop.put("tags", count);
+                    // put description                    
+                    prop.putHTML("desc", scraper.getDescription(), true);
+                    // put language 
+                    prop.putHTML("lang", scraper.getContentLanguages()[0], true);

                } catch (final MalformedURLException e) { /* ignore this */
                } catch (final IOException e) { /* ignore this */
@ -106,7 +106,7 @@ public class getpageinfo_p {
            if(actions.indexOf("robots")>=0){
                try {
                    final yacyURL theURL = new yacyURL(url, null);
-                	
+                    
                	// determine if crawling of the current URL is allowed
                	prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? "0" : "1");
                    
--- a/htroot/xml/util/getpageinfo_p.xml
+++ b/htroot/xml/util/getpageinfo_p.xml
@ -1,6 +1,8 @@
 <?xml version='1.0' standalone='yes'?>
 <pageinfo>
  <title>#[title]#</title>
+  <desc>#[desc]#</desc>
+  <lang>#[lang]#</lang>
  <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
  <sitemap>#[sitemap]#</sitemap>
  <favicon>#[favicon]#</favicon>
@ -9,4 +11,4 @@
    <tag name="#[tag]#" />
    #{/tags}#
  </tags>
-</pageinfo>
+</pageinfo>
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -395,7 +395,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        if (s.length() == 0) {
            return getTitle().toLowerCase().split(splitrex);
        }
-        return s.split(" |,");
+        if (s.contains(",")) return s.split(",");
+        return s.split("\\s");
    }
    
    public int getRefreshSeconds() {