From 5b2a57bfd05801af984134aca27f38ec5e5df713 Mon Sep 17 00:00:00 2001 From: apfelmaennchen Date: Thu, 18 Sep 2008 21:01:23 +0000 Subject: [PATCH] - /xml/util/getpageinfo_p.xml added desc and lang tags - changed htmlFilterContentScraper.getKeywords() to split on either space or comma characters, not both git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5183 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/xml/util/getpageinfo_p.java | 12 ++++++------ htroot/xml/util/getpageinfo_p.xml | 4 +++- .../anomic/htmlFilter/htmlFilterContentScraper.java | 3 ++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java index c27ca9acc..663d61f22 100644 --- a/htroot/xml/util/getpageinfo_p.java +++ b/htroot/xml/util/getpageinfo_p.java @@ -88,16 +88,16 @@ public class getpageinfo_p { int count = 0; for(int i=0;i=0){ try { final yacyURL theURL = new yacyURL(url, null); - + // determine if crawling of the current URL is allowed prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? 
"0" : "1"); diff --git a/htroot/xml/util/getpageinfo_p.xml b/htroot/xml/util/getpageinfo_p.xml index a89d94140..34d2cbb05 100644 --- a/htroot/xml/util/getpageinfo_p.xml +++ b/htroot/xml/util/getpageinfo_p.xml @@ -1,6 +1,8 @@ #[title]# + #[desc]# + #[lang]# #(robots-allowed)#0::1::#(/robots-allowed)# #[sitemap]# #[favicon]# @@ -9,4 +11,4 @@ #{/tags}# - \ No newline at end of file + diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 4feab2a3a..91e5d0964 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -395,7 +395,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (s.length() == 0) { return getTitle().toLowerCase().split(splitrex); } - return s.split(" |,"); + if (s.contains(",")) return s.split(","); + return s.split("\\s"); } public int getRefreshSeconds() {