prepared generic text parser environment

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@15 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent e374aca2cd
commit a87a17a3c8

@@ -149,13 +149,15 @@ public class CacheAdmin_p {
return out;
}
private static String formatAnchor(Properties a) {
private static String formatAnchor(Map a) {
String out = "<table border=\"0\" cellspacing=\"0\" cellpadding=\"0\">";
Enumeration e = a.keys();
Iterator i = a.entrySet().iterator();
String url, descr;
while (e.hasMoreElements()) {
url = (String) e.nextElement();
descr = a.getProperty(url).trim();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
descr = ((String) entry.getValue()).trim();
if (descr.length() == 0) descr = "-";
out += "<tr valign=\"top\"><td><span class=\"small\">" + descr + "&nbsp;</span></td><td class=\"tt\">" + url + "</td></tr>";
}

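The CacheAdmin_p change above replaces the Enumeration over Properties.keys() plus getProperty() with a single pass over Map.entrySet(). A minimal, self-contained sketch of the same iteration pattern (the sample anchors and the plain-text output are illustrative, not the servlet's HTML table):

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class AnchorFormatExample {

    // mirrors the formatAnchor() loop: one pass over entrySet() instead of keys() + getProperty()
    static String formatAnchor(Map anchors) {
        String out = "";
        Iterator i = anchors.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry entry = (Map.Entry) i.next();
            String url = (String) entry.getKey();
            String descr = ((String) entry.getValue()).trim();
            if (descr.length() == 0) descr = "-";
            out += descr + " -> " + url + "\n";
        }
        return out;
    }

    public static void main(String[] args) {
        Map anchors = new HashMap();                         // made-up sample entries
        anchors.put("http://example.net/", "example link");
        anchors.put("http://example.org/doc.html", "   ");
        System.out.print(formatAnchor(anchors));
    }
}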
@@ -161,25 +161,70 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return image;
}
public Properties getHyperlinks() {
public Map getHyperlinks() {
if (hyperlinks == null) resortLinks();
return hyperlinks;
}
public Properties getMedialinks() {
public Map getMedialinks() {
if (medialinks == null) resortLinks();
return medialinks;
}
public Properties getEmaillinks() {
public Map getEmaillinks() {
if (emaillinks == null) resortLinks();
return emaillinks;
}
Properties hyperlinks = null;
Properties medialinks = null;
Properties emaillinks = null;
HashMap hyperlinks = null;
HashMap medialinks = null;
HashMap emaillinks = null;
private synchronized void resortLinks() {
Iterator i;
String url;
int extpos;
String ext;
i = anchor.entrySet().iterator();
hyperlinks = new HashMap();
medialinks = new HashMap();
emaillinks = new HashMap();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
if ((url != null) && (url.startsWith("mailto:"))) {
emaillinks.put(url.substring(7), entry.getValue());
} else {
extpos = url.lastIndexOf(".");
String normal;
if (extpos > 0) {
ext = url.substring(extpos).toLowerCase();
normal = urlNormalform(url);
if (normal != null) {
if (mediaExt.indexOf(ext.substring(1)) >= 0) {
// this is not a normal anchor, it's a media link
medialinks.put(normal, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
}
}
}
}
// finally add the images to the medialinks
i = image.entrySet().iterator();
String normal;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
normal = urlNormalform(url);
if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
}
expandHyperlinks();
}
/*
private synchronized void resortLinks() {
Enumeration e;
String url;
@@ -219,7 +264,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException
}
}
*/
public synchronized void expandHyperlinks() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from

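The new resortLinks() above splits the anchor map into hyperlinks, media links and e-mail links by looking at the "mailto:" prefix and the file extension. A self-contained sketch of that classification step (the extension list is shortened and the sample URLs are made up; the real method additionally normalizes URLs via urlNormalform() and merges the image map into the media links):

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class LinkSortExample {

    // shortened stand-in for plasmaParser.mediaExt
    static final String mediaExt = "jpg,gif,png,pdf,zip,mp3";

    public static void main(String[] args) {
        Map anchors = new HashMap();                                // made-up sample anchors
        anchors.put("http://example.org/index.html", "home");
        anchors.put("http://example.org/photo.jpg", "a picture");
        anchors.put("mailto:someone@example.org", "contact");

        Map hyperlinks = new HashMap();
        Map medialinks = new HashMap();
        Map emaillinks = new HashMap();

        Iterator i = anchors.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry entry = (Map.Entry) i.next();
            String url = (String) entry.getKey();
            if (url.startsWith("mailto:")) {
                emaillinks.put(url.substring(7), entry.getValue());  // strip "mailto:"
            } else {
                int extpos = url.lastIndexOf(".");
                String ext = (extpos > 0) ? url.substring(extpos + 1).toLowerCase() : "";
                if ((ext.length() > 0) && (mediaExt.indexOf(ext) >= 0)) {
                    medialinks.put(url, entry.getValue());           // media link by extension
                } else {
                    hyperlinks.put(url, entry.getValue());           // ordinary hyperlink
                }
            }
        }

        System.out.println("hyperlinks: " + hyperlinks);
        System.out.println("medialinks: " + medialinks);
        System.out.println("emaillinks: " + emaillinks);
    }
}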
@@ -0,0 +1,292 @@
// plasmaParser.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 12.04.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import de.anomic.htmlFilter.*;
import java.io.*;
import java.net.*;
import java.util.*;
public class plasmaParser {
public static String mediaExt =
"swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
public plasmaParser(File parserDispatcherPropertyFile) {
// this is only a dummy for now because we have only one parser...
}
public document parse(URL location, String mimeType, byte[] source) {
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
try {
hfos.write(source);
return new document(new URL(urlNormalform(location)),
mimeType, null, null, scraper.getHeadline(),
null, null,
scraper.getText(), scraper.getAnchor(), scraper.getImage());
} catch (Exception e) {
return null;
}
}
public static String urlNormalform(URL url) {
if (url == null) return null;
return urlNormalform(url.toString());
}
public static String urlNormalform(String us) {
if (us == null) return null;
if (us.length() == 0) return null;
int p;
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
return us;
}
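// Illustrative examples of the normalization above (URLs are made up):
//   urlNormalform("http://example.com/page.html#top") -> "http://example.com/page.html"  (reference removed)
//   urlNormalform("http://example.com:80")            -> "http://example.com"            (default port removed)
//   urlNormalform("http://example.com/")              -> "http://example.com"            (root slash removed)
//   urlNormalform("http://example.com/path/")         -> "http://example.com/path/"      (deeper slashes kept)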
public class document {
URL location; // the source url
String mimeType; // mimeType as taken from http header
String keywords; // most resources provide a keyword field
String shortTitle; // a shortTitle mostly appears in the window header (border)
String longTitle; // the real title of the document, commonly h1-tags
String[] sections; // if present: more titles/headlines appearing in the document
String abstrct; // an abstract, if present: short content description
byte[] text; // the clear text, all that is visible
Map anchors; // all links embedded as clickable entities (anchor tags)
Map images; // all visible pictures in document
// the anchors and images Maps are URL-to-EntityDescription mappings.
// The EntityDescription appears either as visible text in anchors or as alternative
// text in image tags.
Map hyperlinks;
Map medialinks;
Map emaillinks;
public document(URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images) {
this.location = location;
this.mimeType = mimeType;
this.keywords = keywords;
this.shortTitle = shortTitle;
this.longTitle = longTitle;
this.sections = sections;
this.abstrct = abstrct;
this.text = text;
this.anchors = anchors;
this.images = images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
}
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(location, relativePath));
} catch (Exception e) {
return "";
}
}
public String getMainShortTitle() {
if (shortTitle != null) return shortTitle; else return longTitle;
}
public String getMainLongTitle() {
if (longTitle != null) return longTitle; else return shortTitle;
}
public String[] getSectionTitles() {
if (sections != null) return sections; else return new String[]{getMainLongTitle()};
}
public String getAbstract() {
if (abstrct != null) return abstrct; else return getMainLongTitle();
}
public byte[] getText() {
// returns only the clear (visible) text (not the source data)
return text;
}
public Map getAnchors() {
// returns all links embedded as anchors (clickable entities)
return anchors;
}
public Map getImages() {
// returns all links embedded as pictures (visible in document)
return images;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (hyperlinks == null) resortLinks();
return hyperlinks;
}
public Map getMedialinks() {
// this is partly a subset of getAnchor and getImage: all non-hyperrefs
if (medialinks == null) resortLinks();
return medialinks;
}
public Map getEmaillinks() {
// this is part of the getAnchor-set: only links to email addresses
if (emaillinks == null) resortLinks();
return emaillinks;
}
private synchronized void resortLinks() {
Iterator i;
String url;
int extpos;
String ext;
i = anchors.entrySet().iterator();
hyperlinks = new HashMap();
medialinks = new HashMap();
emaillinks = new HashMap();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
if ((url != null) && (url.startsWith("mailto:"))) {
emaillinks.put(url.substring(7), entry.getValue());
} else {
extpos = url.lastIndexOf(".");
String normal;
if (extpos > 0) {
ext = url.substring(extpos).toLowerCase();
normal = urlNormalform(url);
if (normal != null) {
if (mediaExt.indexOf(ext.substring(1)) >= 0) {
// this is not a normal anchor, it's a media link
medialinks.put(normal, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
}
}
}
}
// finally add the images to the medialinks
i = images.entrySet().iterator();
String normal;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
normal = urlNormalform(url);
if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
}
expandHyperlinks();
}
public synchronized void expandHyperlinks() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from
// given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(hyperlinks));
hyperlinks.putAll(allReflinks(medialinks));
hyperlinks.putAll(allSubpaths(hyperlinks));
hyperlinks.putAll(allSubpaths(medialinks));
}
}
private static Map allReflinks(Map links) {
// we find all links that are part of a reference inside a url
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
int pos;
loop: while (i.hasNext()) {
s = (String) i.next();
if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) {
i.remove();
s = s.substring(pos);
while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos);
if (!(v.containsKey(s))) v.put(s, "ref");
continue loop;
}
if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) {
i.remove();
s = "http:/" + s.substring(pos);
while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos);
if (!(v.containsKey(s))) v.put(s, "ref");
continue loop;
}
}
return v;
}
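// Illustrative example (URLs are made up): a key such as
//   "http://rd.example.net/jump?to=http://target.example.org/page"
// is removed from the given map and "http://target.example.org/page" is added
// to the result with the value "ref"; the "/www." branch similarly rewrites keys
// containing an embedded "/www.host/..." part into "http://www.host/...".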
private static Map allSubpaths(Map links) {
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
int pos;
while (i.hasNext()) {
s = (String) i.next();
if (s.endsWith("/")) s = s.substring(0, s.length() - 1);
pos = s.lastIndexOf("/");
while (pos > 8) {
s = s.substring(0, pos + 1);
if (!(v.containsKey(s))) v.put(s, "sub");
s = s.substring(0, pos);
pos = s.lastIndexOf("/");
}
}
return v;
}
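// Illustrative example (URL is made up): for the key
//   "http://example.org/a/b/c.html"
// the loop collects "http://example.org/a/b/", "http://example.org/a/" and
// "http://example.org/" with the value "sub".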
}

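As a whole, the new class is used roughly as sketched below. The sketch is hedged: the property file path, the URL and the page bytes are placeholders, and it assumes the YaCy classes introduced in this commit (plasmaParser and the htmlFilter package) are on the classpath:

import de.anomic.plasma.plasmaParser;

import java.io.File;
import java.net.URL;
import java.util.Map;

public class ParserUsageExample {
    public static void main(String[] args) throws Exception {
        // the property file is not evaluated yet (dummy constructor), so any path works for now
        plasmaParser parser = new plasmaParser(new File("parserDispatcher.properties")); // placeholder path

        URL location = new URL("http://example.net/index.html");                        // placeholder URL
        byte[] source = "<html><body><a href=\"page.html\">a link</a></body></html>".getBytes();

        plasmaParser.document doc = parser.parse(location, "text/html", source);
        if (doc == null) return; // parse() swallows exceptions and returns null on failure

        Map hyperlinks = doc.getHyperlinks(); // lazily computed from the anchor map by resortLinks()
        Map medialinks = doc.getMedialinks();
        Map emaillinks = doc.getEmaillinks();
        System.out.println("hyperlinks: " + hyperlinks);
        System.out.println("medialinks: " + medialinks);
        System.out.println("emaillinks: " + emaillinks);
    }
}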
@@ -482,19 +482,21 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&
(entry.depth < entry.profile.generalDepth())) {
Properties hl = entry.scraper.getHyperlinks();
Enumeration e = hl.propertyNames();
Map hl = entry.scraper.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nexturlstring;
String rejectReason;
int c = 0;
while (e.hasMoreElements()) {
nexturlstring = (String) e.nextElement();
rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, hl.getProperty(nexturlstring), entry.lastModified, entry.depth + 1, entry.profile);
Map.Entry e;
while (i.hasNext()) {
e = (Map.Entry) i.next();
nexturlstring = (String) e.getKey();
rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile);
if (rejectReason == null) {
c++;
} else {
errorURL.newEntry(new URL(nexturlstring), entry.urlString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
hl.getProperty(nexturlstring), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
