From b21b9df2d0cb0eeedae2c77c5df6d59403939145 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Thu, 12 Jan 2006 20:21:34 +0000
Subject: [PATCH] added section headline generation to html parser; it can be
 viewed in cache control, but is not yet included in indexing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1320 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CacheAdmin_p.java                      | 15 ++-
 .../htmlFilter/htmlFilterContentScraper.java  | 91 ++++++++++++++-----
 .../anomic/plasma/parser/rss/rssParser.java   |  2 +-
 source/de/anomic/plasma/plasmaParser.java     |  7 +-
 4 files changed, 85 insertions(+), 30 deletions(-)
diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 22166fc78..7a84721f9 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -115,7 +115,8 @@ public class CacheAdmin_p {
                     final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
                     serverFileUtils.copy(file, os);
                     final plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
-                    info.append("<b>HEADLINE:</b><br>").append(scraper.getHeadline()).append("<br>").append("<br>")
+                    info.append("<b>TITLE:</b><br>").append(scraper.getTitle()).append("<br>").append("<br>")
+                        .append("<b>SECTION HEADLINES:</b><br>").append(formatTitles(document.getSectionTitles())).append("<br>")
                         .append("<b>HREF:</b><br>").append(formatAnchor(document.getHyperlinks())).append("<br>")
                         .append("<b>MEDIA:</b><br>").append(formatAnchor(document.getMedialinks())).append("<br>")
                         .append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
@@ -187,10 +188,20 @@ public class CacheAdmin_p {
         return prop;
     }
 
+    // Renders titles as an HTML <ul> with one <li> per entry, in array order; entries are inserted verbatim (no HTML escaping).
+    private static String formatTitles(String[] titles) {
+        final StringBuffer list = new StringBuffer("<ul>");
+        int idx = 0;
+        while (idx < titles.length) {
+            list.append("<li>").append(titles[idx++]).append("</li>");
+        }
+        return list.append("</ul>").toString();
+    }
+    
     private static String formatHeader(httpHeader header) {
         final StringBuffer result = new StringBuffer(2048);
         if (header == null) {
-            result.append("- no header in header cache -");
+            result.append("- no header in header cache -<br>");
         } else {
             result.append("<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\">");
             final Iterator iter = header.entrySet().iterator();
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 3f987fcf5..38d3eaa67 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -46,7 +46,9 @@ package de.anomic.htmlFilter;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.Collator;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
@@ -77,6 +79,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         linkTags1 = new TreeSet(insensitiveCollator);
         linkTags1.add("a");
         linkTags1.add("h1");
+        linkTags1.add("h2");
+        linkTags1.add("h3");
+        linkTags1.add("h4");
         linkTags1.add("title");
     }
 
@@ -84,7 +89,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     private HashMap anchors;
     private HashMap images;
     private String title;
-    private String headline;
+    // headlines[0..3] collect the cleaned texts of h1..h4 tags, in the order they were scraped
+    private List[] headlines;
     private serverByteBuffer content;
     private URL root;
 
@@ -96,7 +102,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         this.anchors = new HashMap();
         this.images = new HashMap();
         this.title = "";
-        this.headline = "";
+        this.headlines = new ArrayList[4];
+        for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
         this.content = new serverByteBuffer(1024);
     }
 
@@ -204,29 +211,61 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
 //      System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
         if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
-        if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
-        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();        
+        // h1..h4 texts are collected per level in headlines[0..3];
+        // the title text is kept in its own field.
+        // For all five tags, bodies of 1024 bytes or more are ignored.
+        if (text.length < 1024) {
+            for (int level = 1; level <= 4; level++) {
+                if (tagname.equalsIgnoreCase("h" + level)) {
+                    final String headline = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
+                    // cleanLine returns "" for text it rejects; such headlines are not stored
+                    if (headline.length() > 0) {
+                        headlines[level - 1].add(headline);
+                    }
+                }
+            }
+            if (tagname.equalsIgnoreCase("title")) {
+                // unlike headlines, the title is overwritten even when the cleaned text is empty
+                title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
+            }
+        }
     }
 
-    public String getHeadline() {
-        String hl = "";
+    private static String cleanLine(String s) {
+        // replace control characters (everything below ' ') with a blank
+        for (int i = 0; i < s.length(); i++)
+            if (s.charAt(i) < ' ') s = s.substring(0, i) + " " + s.substring(i + 1);
 
-        // extract headline from content
-        if (title.length() > 0) hl = title.trim();
-        else if (headline.length() > 0) hl = headline.trim();
-        else if (content.length() > 80) hl = new String(content.getBytes(), 0, 80).trim();
-        else hl = content.trim().toString();
-
-        // clean the line: may contain too many funny symbols
-        for (int i = 0; i < hl.length(); i++)
-            if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1);
-
-        // clean the line: remove double-spaces
+        // remove double-spaces
         int p;
-        while ((p = hl.indexOf("  ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1);        
+        while ((p = s.indexOf("  ")) >= 0) s = s.substring(0, p) + s.substring(p + 1);        
 
+        // reject results shorter than 4 characters; this applies to titles as well as headlines
+        s = s.trim();
+        if (s.length() < 4) s = "";
+        
         // return result
-        return hl.trim();
+        return s;
+    }
+    
+    public String getTitle() {
+        // Builds a title even for documents without a title tag (the result may still be empty).
+        // Preference order: the scraped title, then the first h1, h2, h3 or h4 headline (in that order).
+        if (title.length() > 0) return title;
+        for (int level = 0; level < 4; level++) {
+            if (!headlines[level].isEmpty()) return (String) headlines[level].get(0);
+        }
+        // Last resort: clean the first 80 bytes of the content, or the whole trimmed content when content.length() is at most 80.
+        final String lead = (content.length() > 80) ? new String(content.getBytes(), 0, 80)
+                                                    : content.trim().toString();
+        return cleanLine(lead);
+    }
+    
+    public String[] getHeadlines(int i) {
+        // i is the headline level, 1..4 for h1..h4; the returned array is a fresh copy of that level's list
+        assert ((i >= 1) && (i <= 4));
+        final List level = headlines[i - 1];
+        return (String[]) level.toArray(new String[level.size()]);
+    }
 
     public byte[] getText() {
@@ -247,17 +286,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         anchors = null;
         images = null;
         title = null;
-        headline = null;
+        headlines = null;
         content = null;
         root = null;
     }
 
     public void print() {
-    System.out.println("TITLE   :" + title);
-    System.out.println("HEADLINE:" + headline);
-    System.out.println("ANCHORS :" + anchors.toString());
-    System.out.println("IMAGES  :" + images.toString());
-    System.out.println("TEXT    :" + new String(content.getBytes()));
+        System.out.println("TITLE    :" + title);
+        for (int level = 1; level <= 4; level++) { // 1..4 = h1..h4, the same numbering getHeadlines(int) uses
+            System.out.println("HEADLINE" + level + ":" + headlines[level - 1].toString());
+        }
+        System.out.println("ANCHORS  :" + anchors.toString());
+        System.out.println("IMAGES   :" + images.toString());
+        System.out.println("TEXT     :" + new String(content.getBytes()));
     }
 
 /*
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index 7840a844a..ed295e178 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -152,7 +152,7 @@ public class rssParser extends AbstractParser implements Parser {
                         OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
                         serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes()), os);
                         
-                        String itemHeadline = scraper.getHeadline();     
+                        String itemHeadline = scraper.getTitle();     
                         if ((itemHeadline != null) && (itemHeadline.length() > 0)) {
                             feedSections.add(itemHeadline);
                         }
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index aaa69573c..ac75c92f3 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -543,9 +543,12 @@ public final class plasmaParser {
     
     public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
         try {
+            String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
+            int p = 0;
+            for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
             plasmaParserDocument ppd =  new plasmaParserDocument(new URL(htmlFilterContentScraper.urlNormalform(location)),
-                                mimeType, null, null, scraper.getHeadline(),
-                                null, null,
+                                mimeType, null, null, scraper.getTitle(),
+                                sections, null,
                                 scraper.getText(), scraper.getAnchors(), scraper.getImages());
             //scraper.close();
             return ppd;