From b21b9df2d0cb0eeedae2c77c5df6d59403939145 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 12 Jan 2006 20:21:34 +0000 Subject: [PATCH] added section headlines generation to html parser can be viewed in cache control, but is not yet included to indexing git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1320 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 15 ++- .../htmlFilter/htmlFilterContentScraper.java | 91 ++++++++++++++----- .../anomic/plasma/parser/rss/rssParser.java | 2 +- source/de/anomic/plasma/plasmaParser.java | 7 +- 4 files changed, 85 insertions(+), 30 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 22166fc78..7a84721f9 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -115,7 +115,8 @@ public class CacheAdmin_p { final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); serverFileUtils.copy(file, os); final plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper); - info.append("HEADLINE:
").append(scraper.getHeadline()).append("
").append("
") + info.append("TITLE:
").append(scraper.getTitle()).append("
").append("
") + .append("SECTION HEADLINES:
").append(formatTitles(document.getSectionTitles())).append("
") .append("HREF:
").append(formatAnchor(document.getHyperlinks())).append("
") .append("MEDIA:
").append(formatAnchor(document.getMedialinks())).append("
") .append("EMAIL:
").append(formatAnchor(document.getEmaillinks())).append("
") @@ -187,10 +188,20 @@ public class CacheAdmin_p { return prop; } + private static String formatTitles(String[] titles) { + StringBuffer s = new StringBuffer(); + s.append(""); + return new String(s); + } + private static String formatHeader(httpHeader header) { final StringBuffer result = new StringBuffer(2048); if (header == null) { - result.append("- no header in header cache -"); + result.append("- no header in header cache -
"); } else { result.append(""); final Iterator iter = header.entrySet().iterator(); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 3f987fcf5..38d3eaa67 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -46,7 +46,9 @@ package de.anomic.htmlFilter; import java.net.MalformedURLException; import java.net.URL; import java.text.Collator; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; @@ -77,6 +79,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen linkTags1 = new TreeSet(insensitiveCollator); linkTags1.add("a"); linkTags1.add("h1"); + linkTags1.add("h2"); + linkTags1.add("h3"); + linkTags1.add("h4"); linkTags1.add("title"); } @@ -84,7 +89,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private HashMap anchors; private HashMap images; private String title; - private String headline; + //private String headline; + private List[] headlines; private serverByteBuffer content; private URL root; @@ -96,7 +102,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.anchors = new HashMap(); this.images = new HashMap(); this.title = ""; - this.headline = ""; + this.headlines = new ArrayList[4]; + for (int i = 0; i < 4; i++) headlines[i] = new ArrayList(); this.content = new serverByteBuffer(1024); } @@ -204,29 +211,61 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString()); - if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString(); - if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString(); + String h; + if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + if (h.length() > 0) headlines[0].add(h); + } + if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + if (h.length() > 0) headlines[1].add(h); + } + if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + if (h.length() > 0) headlines[2].add(h); + } + if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + if (h.length() > 0) headlines[3].add(h); + } + if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); } - public String getHeadline() { - String hl = ""; + private static String cleanLine(String s) { + // may contain too many funny symbols + for (int i = 0; i < s.length(); i++) + if (s.charAt(i) < ' ') s = s.substring(0, i) + " " + s.substring(i + 1); - // extract headline from content - if (title.length() > 0) hl = title.trim(); - else if (headline.length() > 0) hl = headline.trim(); - else if (content.length() > 80) hl = new String(content.getBytes(), 0, 80).trim(); - else hl = content.trim().toString(); - - // clean the line: may contain too many funny symbols - for (int i = 0; i < hl.length(); i++) - if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1); - - // clean the line: remove double-spaces + // remove double-spaces int p; - while ((p = hl.indexOf(" ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1); + while ((p = s.indexOf(" ")) >= 0) s = s.substring(0, p) + s.substring(p + 1); + // we don't accept headlines that are too short + s = s.trim(); + if (s.length() < 4) s = ""; + // return result - return hl.trim(); + return s; + } + + public String getTitle() { + // construct a title string, even if the document has no title + // if there is one, return it + if (title.length() > 0) return title; + // othervise take any headline + for (int i = 0; i < 4; i++) { + if (headlines[i].size() > 0) return (String) headlines[i].get(0); + } + // extract headline from content + if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80)); + return cleanLine(content.trim().toString()); + } + + public String[] getHeadlines(int i) { + assert ((i >= 1) && (i <= 4)); + String[] s = new String[headlines[i - 1].size()]; + for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = (String) headlines[i - 1].get(j); + return s; } public byte[] getText() { @@ -247,17 +286,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen anchors = null; images = null; title = null; - headline = null; + headlines = null; content = null; root = null; } public void print() { - System.out.println("TITLE :" + title); - System.out.println("HEADLINE:" + headline); - System.out.println("ANCHORS :" + anchors.toString()); - System.out.println("IMAGES :" + images.toString()); - System.out.println("TEXT :" + new String(content.getBytes())); + System.out.println("TITLE :" + title); + for (int i = 0; i < 4; i++) { + System.out.println("HEADLINE" + i + ":" + headlines[i].toString()); + } + System.out.println("ANCHORS :" + anchors.toString()); + System.out.println("IMAGES :" + images.toString()); + System.out.println("TEXT :" + new String(content.getBytes())); } /* diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 7840a844a..ed295e178 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -152,7 +152,7 @@ public class rssParser extends AbstractParser implements Parser { OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes()), os); - String itemHeadline = scraper.getHeadline(); + String itemHeadline = scraper.getTitle(); if ((itemHeadline != null) && (itemHeadline.length() > 0)) { feedSections.add(itemHeadline); } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index aaa69573c..ac75c92f3 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -543,9 +543,12 @@ public final class plasmaParser { public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) { try { + String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; + int p = 0; + for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; plasmaParserDocument ppd = new plasmaParserDocument(new URL(htmlFilterContentScraper.urlNormalform(location)), - mimeType, null, null, scraper.getHeadline(), - null, null, + mimeType, null, null, scraper.getTitle(), + sections, null, scraper.getText(), scraper.getAnchors(), scraper.getImages()); //scraper.close(); return ppd;