diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 22166fc78..7a84721f9 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -115,7 +115,8 @@ public class CacheAdmin_p {
final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(file, os);
final plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
- info.append("HEADLINE:<br>").append(scraper.getHeadline()).append("<br>").append("<br>")
+ info.append("TITLE:<br>").append(scraper.getTitle()).append("<br>").append("<br>")
+ .append("SECTION HEADLINES:<br>").append(formatTitles(document.getSectionTitles())).append("<br>")
.append("HREF:<br>").append(formatAnchor(document.getHyperlinks())).append("<br>")
.append("MEDIA:<br>").append(formatAnchor(document.getMedialinks())).append("<br>")
.append("EMAIL:<br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
@@ -187,10 +188,20 @@ public class CacheAdmin_p {
return prop;
}
+ private static String formatTitles(String[] titles) {
+ StringBuffer s = new StringBuffer();
+ s.append("<br>");
+ for (int i = 0; i < titles.length; i++) {
+ s.append("- ").append(titles[i]).append("<br>");
+ }
+ s.append("<br>");
+ return new String(s);
+ }
+
private static String formatHeader(httpHeader header) {
final StringBuffer result = new StringBuffer(2048);
if (header == null) {
- result.append("- no header in header cache -");
+ result.append("- no header in header cache -<br>");
} else {
result.append("");
final Iterator iter = header.entrySet().iterator();
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 3f987fcf5..38d3eaa67 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -46,7 +46,9 @@ package de.anomic.htmlFilter;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.Collator;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
@@ -77,6 +79,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
linkTags1.add("h1");
+ linkTags1.add("h2");
+ linkTags1.add("h3");
+ linkTags1.add("h4");
linkTags1.add("title");
}
@@ -84,7 +89,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
private HashMap anchors;
private HashMap images;
private String title;
- private String headline;
+ //private String headline;
+ private List[] headlines;
private serverByteBuffer content;
private URL root;
@@ -96,7 +102,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.anchors = new HashMap();
this.images = new HashMap();
this.title = "";
- this.headline = "";
+ this.headlines = new ArrayList[4];
+ for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
this.content = new serverByteBuffer(1024);
}
@@ -204,29 +211,61 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
- if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
- if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();
+ String h;
+ if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
+ h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
+ if (h.length() > 0) headlines[0].add(h);
+ }
+ if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
+ h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
+ if (h.length() > 0) headlines[1].add(h);
+ }
+ if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
+ h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
+ if (h.length() > 0) headlines[2].add(h);
+ }
+ if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
+ h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
+ if (h.length() > 0) headlines[3].add(h);
+ }
+ if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
}
- public String getHeadline() {
- String hl = "";
+ private static String cleanLine(String s) {
+ // may contain too many funny symbols
+ for (int i = 0; i < s.length(); i++)
+ if (s.charAt(i) < ' ') s = s.substring(0, i) + " " + s.substring(i + 1);
- // extract headline from content
- if (title.length() > 0) hl = title.trim();
- else if (headline.length() > 0) hl = headline.trim();
- else if (content.length() > 80) hl = new String(content.getBytes(), 0, 80).trim();
- else hl = content.trim().toString();
-
- // clean the line: may contain too many funny symbols
- for (int i = 0; i < hl.length(); i++)
- if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1);
-
- // clean the line: remove double-spaces
+ // remove double-spaces
int p;
- while ((p = hl.indexOf(" ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1);
+ while ((p = s.indexOf(" ")) >= 0) s = s.substring(0, p) + s.substring(p + 1);
+ // we don't accept headlines that are too short
+ s = s.trim();
+ if (s.length() < 4) s = "";
+
// return result
- return hl.trim();
+ return s;
+ }
+
+ public String getTitle() {
+ // construct a title string, even if the document has no title
+ // if there is one, return it
+ if (title.length() > 0) return title;
+ // othervise take any headline
+ for (int i = 0; i < 4; i++) {
+ if (headlines[i].size() > 0) return (String) headlines[i].get(0);
+ }
+ // extract headline from content
+ if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
+ return cleanLine(content.trim().toString());
+ }
+
+ public String[] getHeadlines(int i) {
+ assert ((i >= 1) && (i <= 4));
+ String[] s = new String[headlines[i - 1].size()];
+ for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = (String) headlines[i - 1].get(j);
+ return s;
}
public byte[] getText() {
@@ -247,17 +286,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
anchors = null;
images = null;
title = null;
- headline = null;
+ headlines = null;
content = null;
root = null;
}
public void print() {
- System.out.println("TITLE :" + title);
- System.out.println("HEADLINE:" + headline);
- System.out.println("ANCHORS :" + anchors.toString());
- System.out.println("IMAGES :" + images.toString());
- System.out.println("TEXT :" + new String(content.getBytes()));
+ System.out.println("TITLE :" + title);
+ for (int i = 0; i < 4; i++) {
+ System.out.println("HEADLINE" + i + ":" + headlines[i].toString());
+ }
+ System.out.println("ANCHORS :" + anchors.toString());
+ System.out.println("IMAGES :" + images.toString());
+ System.out.println("TEXT :" + new String(content.getBytes()));
}
/*
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index 7840a844a..ed295e178 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -152,7 +152,7 @@ public class rssParser extends AbstractParser implements Parser {
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes()), os);
- String itemHeadline = scraper.getHeadline();
+ String itemHeadline = scraper.getTitle();
if ((itemHeadline != null) && (itemHeadline.length() > 0)) {
feedSections.add(itemHeadline);
}
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index aaa69573c..ac75c92f3 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -543,9 +543,12 @@ public final class plasmaParser {
public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {
+ String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
+ int p = 0;
+ for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(new URL(htmlFilterContentScraper.urlNormalform(location)),
- mimeType, null, null, scraper.getHeadline(),
- null, null,
+ mimeType, null, null, scraper.getTitle(),
+ sections, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
//scraper.close();
return ppd;