- /xml/util/getpageinfo_p.xml added <desc> and <lang> tags

- changed htmlFilterContentScraper.getKeywords() to split on either the space or the comma character, not both

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5183 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 17 years ago
parent e1f67262f7
commit 5b2a57bfd0

@ -88,16 +88,16 @@ public class getpageinfo_p {
int count = 0;
for(int i=0;i<list.length;i++){
String tag = list[i];
if (!tag.equals("")) {
while (i<(list.length-1) && !list[i+1].equals("")) {
i++;
tag += " "+list[i];
}
if (!tag.equals("")) {
prop.putHTML("tags_"+count+"_tag", tag, true);
count++;
}
}
prop.put("tags", count);
// put description
prop.putHTML("desc", scraper.getDescription(), true);
// put language
prop.putHTML("lang", scraper.getContentLanguages()[0], true);
} catch (final MalformedURLException e) { /* ignore this */
} catch (final IOException e) { /* ignore this */
@ -106,7 +106,7 @@ public class getpageinfo_p {
if(actions.indexOf("robots")>=0){
try {
final yacyURL theURL = new yacyURL(url, null);
// determine if crawling of the current URL is allowed
prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? "0" : "1");

@ -1,6 +1,8 @@
<?xml version='1.0' standalone='yes'?>
<pageinfo>
<title>#[title]#</title>
<desc>#[desc]#</desc>
<lang>#[lang]#</lang>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon>
@ -9,4 +11,4 @@
<tag name="#[tag]#" />
#{/tags}#
</tags>
</pageinfo>
</pageinfo>

@ -395,7 +395,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (s.length() == 0) {
return getTitle().toLowerCase().split(splitrex);
}
return s.split(" |,");
if (s.contains(",")) return s.split(",");
return s.split("\\s");
}
public int getRefreshSeconds() {

Loading…
Cancel
Save