From a87a17a3c8eca4eafd138f12fdc0695ab9d20e5f Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 12 Apr 2005 22:57:54 +0000 Subject: [PATCH] prepared generic text parser environment git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@15 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 12 +- .../htmlFilter/htmlFilterContentScraper.java | 61 +++- source/de/anomic/plasma/plasmaParser.java | 292 ++++++++++++++++++ .../de/anomic/plasma/plasmaSwitchboard.java | 14 +- 4 files changed, 360 insertions(+), 19 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaParser.java diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 0b18d59a3..6f23c5b77 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -149,13 +149,15 @@ public class CacheAdmin_p { return out; } - private static String formatAnchor(Properties a) { + private static String formatAnchor(Map a) { String out = ""; - Enumeration e = a.keys(); + Iterator i = a.entrySet().iterator(); String url, descr; - while (e.hasMoreElements()) { - url = (String) e.nextElement(); - descr = a.getProperty(url).trim(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + url = (String) entry.getKey(); + descr = ((String) entry.getValue()).trim(); if (descr.length() == 0) descr = "-"; out += ""; } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index e5e4f49f8..48ca7c8d7 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -161,25 +161,70 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return image; } - public Properties getHyperlinks() { + public Map getHyperlinks() { if (hyperlinks == null) resortLinks(); return hyperlinks; } - public Properties getMedialinks() { + public Map getMedialinks() { if (medialinks == null) resortLinks(); return medialinks; } - public Properties getEmaillinks() { + public Map getEmaillinks() { if (emaillinks == null) resortLinks(); return emaillinks; } - Properties hyperlinks = null; - Properties medialinks = null; - Properties emaillinks = null; - + HashMap hyperlinks = null; + HashMap medialinks = null; + HashMap emaillinks = null; + + private synchronized void resortLinks() { + Iterator i; + String url; + int extpos; + String ext; + i = anchor.entrySet().iterator(); + hyperlinks = new HashMap(); + medialinks = new HashMap(); + emaillinks = new HashMap(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + url = (String) entry.getKey(); + if ((url != null) && (url.startsWith("mailto:"))) { + emaillinks.put(url.substring(7), entry.getValue()); + } else { + extpos = url.lastIndexOf("."); + String normal; + if (extpos > 0) { + ext = url.substring(extpos).toLowerCase(); + normal = urlNormalform(url); + if (normal != null) { + if (mediaExt.indexOf(ext.substring(1)) >= 0) { + // this is not an normal anchor, its a media link + medialinks.put(normal, entry.getValue()); + } else { + hyperlinks.put(normal, entry.getValue()); + } + } + } + } + } + // finally add the images to the medialinks + i = image.entrySet().iterator(); + String normal; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + url = (String) entry.getKey(); + normal = urlNormalform(url); + if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException + } + expandHyperlinks(); + } + + /* private synchronized void resortLinks() { Enumeration e; String url; @@ -219,7 +264,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException } } - +*/ public synchronized void expandHyperlinks() { // we add artificial hyperlinks to the hyperlink set that can be calculated from diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java new file mode 100644 index 000000000..747f9c7eb --- /dev/null +++ b/source/de/anomic/plasma/plasmaParser.java @@ -0,0 +1,292 @@ +// plasmaParser.java +// ------------------------ +// part of YaCy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 12.04.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + + +package de.anomic.plasma; + +import de.anomic.htmlFilter.*; +import java.io.*; +import java.net.*; +import java.util.*; + +public class plasmaParser { + + public static String mediaExt = + "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + + "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj"; + + + public plasmaParser(File parserDispatcherPropertyFile) { + // this is only a dummy yet because we have only one parser... + + } + + public document parse(URL location, String mimeType, byte[] source) { + // make a scraper and transformer + htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); + OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); + try { + hfos.write(source); + return new document(new URL(urlNormalform(location)), + mimeType, null, null, scraper.getHeadline(), + null, null, + scraper.getText(), scraper.getAnchor(), scraper.getImage()); + } catch (Exception e) { + return null; + } + } + + public static String urlNormalform(URL url) { + if (url == null) return null; + return urlNormalform(url.toString()); + } + + public static String urlNormalform(String us) { + if (us == null) return null; + if (us.length() == 0) return null; + int p; + if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p); + if (us.endsWith(":80")) us = us.substring(0, us.length() - 3); + if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1); + return us; + } + + + + public class document { + + URL location; // the source url + String mimeType; // mimeType as taken from http header + String keywords; // most resources provide a keyword field + String shortTitle; // a shortTitle mostly appears in the window header (border) + String longTitle; // the real title of the document, commonly h1-tags + String[] sections; // if present: more titles/headlines appearing in the document + String abstrct; // an abstract, if present: short content description + byte[] text; // the clear text, all that is visible + Map anchors; // all links embedded as clickeable entities (anchor tags) + Map images; // all visible pictures in document + // the anchors and images - Maps are URL-to-EntityDescription mappings. + // The EntityDescription appear either as visible text in anchors or as alternative + // text in image tags. + Map hyperlinks; + Map medialinks; + Map emaillinks; + + public document(URL location, String mimeType, + String keywords, String shortTitle, String longTitle, + String[] sections, String abstrct, + byte[] text, Map anchors, Map images) { + this.location = location; + this.mimeType = mimeType; + this.keywords = keywords; + this.shortTitle = shortTitle; + this.longTitle = longTitle; + this.sections = sections; + this.abstrct = abstrct; + this.text = text; + this.anchors = anchors; + this.images = images; + this.hyperlinks = null; + this.medialinks = null; + this.emaillinks = null; + } + + private String absolutePath(String relativePath) { + try { + return urlNormalform(new URL(location, relativePath)); + } catch (Exception e) { + return ""; + } + } + + public String getMainShortTitle() { + if (shortTitle != null) return shortTitle; else return longTitle; + } + + public String getMainLongTitle() { + if (longTitle != null) return longTitle; else return shortTitle; + } + + public String[] getSectionTitles() { + if (sections != null) return sections; else return new String[]{getMainLongTitle()}; + } + + public String getAbstract() { + if (abstrct != null) return abstrct; else return getMainLongTitle(); + } + + public byte[] getText() { + // returns only the clear (visible) text (not the source data) + return text; + } + + public Map getAnchors() { + // returns all links embedded as anchors (clickeable entities) + return anchors; + } + + public Map getImages() { + // returns all links enbedded as pictures (visible iin document) + return images; + } + + // the next three methods provide a calculated view on the getAnchors/getImages: + + public Map getHyperlinks() { + // this is a subset of the getAnchor-set: only links to other hyperrefs + if (hyperlinks == null) resortLinks(); + return hyperlinks; + } + + public Map getMedialinks() { + // this is partly subset of getAnchor and getImage: all non-hyperrefs + if (medialinks == null) resortLinks(); + return medialinks; + } + + public Map getEmaillinks() { + // this is part of the getAnchor-set: only links to email addresses + if (emaillinks == null) resortLinks(); + return emaillinks; + } + + private synchronized void resortLinks() { + Iterator i; + String url; + int extpos; + String ext; + i = anchors.entrySet().iterator(); + hyperlinks = new HashMap(); + medialinks = new HashMap(); + emaillinks = new HashMap(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + url = (String) entry.getKey(); + if ((url != null) && (url.startsWith("mailto:"))) { + emaillinks.put(url.substring(7), entry.getValue()); + } else { + extpos = url.lastIndexOf("."); + String normal; + if (extpos > 0) { + ext = url.substring(extpos).toLowerCase(); + normal = urlNormalform(url); + if (normal != null) { + if (mediaExt.indexOf(ext.substring(1)) >= 0) { + // this is not an normal anchor, its a media link + medialinks.put(normal, entry.getValue()); + } else { + hyperlinks.put(normal, entry.getValue()); + } + } + } + } + } + // finally add the images to the medialinks + i = images.entrySet().iterator(); + String normal; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + url = (String) entry.getKey(); + normal = urlNormalform(url); + if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException + } + expandHyperlinks(); + } + + + public synchronized void expandHyperlinks() { + // we add artificial hyperlinks to the hyperlink set that can be calculated from + // given hyperlinks and imagelinks + hyperlinks.putAll(allReflinks(hyperlinks)); + hyperlinks.putAll(allReflinks(medialinks)); + hyperlinks.putAll(allSubpaths(hyperlinks)); + hyperlinks.putAll(allSubpaths(medialinks)); + } + + } + + private static Map allReflinks(Map links) { + // we find all links that are part of a reference inside a url + HashMap v = new HashMap(); + Iterator i = links.keySet().iterator(); + String s; + int pos; + loop: while (i.hasNext()) { + s = (String) i.next(); + if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { + i.remove(); + s = s.substring(pos); + while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); + if (!(v.containsKey(s))) v.put(s, "ref"); + continue loop; + } + if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { + i.remove(); + s = "http:/" + s.substring(pos); + while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); + if (!(v.containsKey(s))) v.put(s, "ref"); + continue loop; + } + } + return v; + } + + private static Map allSubpaths(Map links) { + HashMap v = new HashMap(); + Iterator i = links.keySet().iterator(); + String s; + int pos; + while (i.hasNext()) { + s = (String) i.next(); + if (s.endsWith("/")) s = s.substring(0, s.length() - 1); + pos = s.lastIndexOf("/"); + while (pos > 8) { + s = s.substring(0, pos + 1); + if (!(v.containsKey(s))) v.put(s, "sub"); + s = s.substring(0, pos); + pos = s.lastIndexOf("/"); + } + } + return v; + } + +} diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index a05bca134..633f4610e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -482,19 +482,21 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi // put anchors on crawl stack if (((processCase == 4) || (processCase == 5)) && (entry.depth < entry.profile.generalDepth())) { - Properties hl = entry.scraper.getHyperlinks(); - Enumeration e = hl.propertyNames(); + Map hl = entry.scraper.getHyperlinks(); + Iterator i = hl.entrySet().iterator(); String nexturlstring; String rejectReason; int c = 0; - while (e.hasMoreElements()) { - nexturlstring = (String) e.nextElement(); - rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, hl.getProperty(nexturlstring), entry.lastModified, entry.depth + 1, entry.profile); + Map.Entry e; + while (i.hasNext()) { + e = (Map.Entry) i.next(); + nexturlstring = (String) e.getKey(); + rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile); if (rejectReason == null) { c++; } else { errorURL.newEntry(new URL(nexturlstring), entry.urlString, entry.initiator(), yacyCore.seedDB.mySeed.hash, - hl.getProperty(nexturlstring), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); + (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); } } log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
" + descr + " " + url + "