diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 0b18d59a3..6f23c5b77 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -149,13 +149,15 @@ public class CacheAdmin_p {
return out;
}
- private static String formatAnchor(Properties a) {
+ private static String formatAnchor(Map a) {
String out = "
";
- Enumeration e = a.keys();
+ Iterator i = a.entrySet().iterator();
String url, descr;
- while (e.hasMoreElements()) {
- url = (String) e.nextElement();
- descr = a.getProperty(url).trim();
+ Map.Entry entry;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ url = (String) entry.getKey();
+ descr = ((String) entry.getValue()).trim();
if (descr.length() == 0) descr = "-";
out += "" + descr + " | " + url + " |
";
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index e5e4f49f8..48ca7c8d7 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -161,25 +161,70 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return image;
}
- public Properties getHyperlinks() {
+ public Map getHyperlinks() {
if (hyperlinks == null) resortLinks();
return hyperlinks;
}
- public Properties getMedialinks() {
+ public Map getMedialinks() {
if (medialinks == null) resortLinks();
return medialinks;
}
- public Properties getEmaillinks() {
+ public Map getEmaillinks() {
if (emaillinks == null) resortLinks();
return emaillinks;
}
- Properties hyperlinks = null;
- Properties medialinks = null;
- Properties emaillinks = null;
-
+ HashMap hyperlinks = null;
+ HashMap medialinks = null;
+ HashMap emaillinks = null;
+
+ private synchronized void resortLinks() {
+ Iterator i;
+ String url;
+ int extpos;
+ String ext;
+ i = anchor.entrySet().iterator();
+ hyperlinks = new HashMap();
+ medialinks = new HashMap();
+ emaillinks = new HashMap();
+ Map.Entry entry;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ url = (String) entry.getKey();
+ if ((url != null) && (url.startsWith("mailto:"))) {
+ emaillinks.put(url.substring(7), entry.getValue());
+ } else {
+ extpos = url.lastIndexOf(".");
+ String normal;
+ if (extpos > 0) {
+ ext = url.substring(extpos).toLowerCase();
+ normal = urlNormalform(url);
+ if (normal != null) {
+ if (mediaExt.indexOf(ext.substring(1)) >= 0) {
+ // this is not a normal anchor, it's a media link
+ medialinks.put(normal, entry.getValue());
+ } else {
+ hyperlinks.put(normal, entry.getValue());
+ }
+ }
+ }
+ }
+ }
+ // finally add the images to the medialinks
+ i = image.entrySet().iterator();
+ String normal;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ url = (String) entry.getKey();
+ normal = urlNormalform(url);
+ if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
+ }
+ expandHyperlinks();
+ }
+
+ /*
private synchronized void resortLinks() {
Enumeration e;
String url;
@@ -219,7 +264,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException
}
}
-
+*/
public synchronized void expandHyperlinks() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
new file mode 100644
index 000000000..747f9c7eb
--- /dev/null
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -0,0 +1,292 @@
+// plasmaParser.java
+// ------------------------
+// part of YaCy
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2005
+// last major change: 12.04.2005
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this software or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notice above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+package de.anomic.plasma;
+
+import de.anomic.htmlFilter.*;
+import java.io.*;
+import java.net.*;
+import java.util.*;
+
+public class plasmaParser {
+
+ public static String mediaExt =
+ "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
+ "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
+
+
+ public plasmaParser(File parserDispatcherPropertyFile) {
+ // this is only a dummy yet because we have only one parser...
+
+ }
+
+ public document parse(URL location, String mimeType, byte[] source) {
+ // make a scraper and transformer
+ htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+ OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+ try {
+ hfos.write(source);
+ return new document(new URL(urlNormalform(location)),
+ mimeType, null, null, scraper.getHeadline(),
+ null, null,
+ scraper.getText(), scraper.getAnchor(), scraper.getImage());
+ } catch (Exception e) {
+ return null;
+ }
+ }
+
+ public static String urlNormalform(URL url) {
+ if (url == null) return null;
+ return urlNormalform(url.toString());
+ }
+
+ public static String urlNormalform(String us) {
+ if (us == null) return null;
+ if (us.length() == 0) return null;
+ int p;
+ if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
+ if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
+ if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
+ return us;
+ }
+
+
+
+ public class document {
+
+ URL location; // the source url
+ String mimeType; // mimeType as taken from http header
+ String keywords; // most resources provide a keyword field
+ String shortTitle; // a shortTitle mostly appears in the window header (border)
+ String longTitle; // the real title of the document, commonly h1-tags
+ String[] sections; // if present: more titles/headlines appearing in the document
+ String abstrct; // an abstract, if present: short content description
+ byte[] text; // the clear text, all that is visible
+ Map anchors; // all links embedded as clickable entities (anchor tags)
+ Map images; // all visible pictures in document
+ // the anchors and images - Maps are URL-to-EntityDescription mappings.
+ // The EntityDescription appear either as visible text in anchors or as alternative
+ // text in image tags.
+ Map hyperlinks;
+ Map medialinks;
+ Map emaillinks;
+
+ public document(URL location, String mimeType,
+ String keywords, String shortTitle, String longTitle,
+ String[] sections, String abstrct,
+ byte[] text, Map anchors, Map images) {
+ this.location = location;
+ this.mimeType = mimeType;
+ this.keywords = keywords;
+ this.shortTitle = shortTitle;
+ this.longTitle = longTitle;
+ this.sections = sections;
+ this.abstrct = abstrct;
+ this.text = text;
+ this.anchors = anchors;
+ this.images = images;
+ this.hyperlinks = null;
+ this.medialinks = null;
+ this.emaillinks = null;
+ }
+
+ private String absolutePath(String relativePath) {
+ try {
+ return urlNormalform(new URL(location, relativePath));
+ } catch (Exception e) {
+ return "";
+ }
+ }
+
+ public String getMainShortTitle() {
+ if (shortTitle != null) return shortTitle; else return longTitle;
+ }
+
+ public String getMainLongTitle() {
+ if (longTitle != null) return longTitle; else return shortTitle;
+ }
+
+ public String[] getSectionTitles() {
+ if (sections != null) return sections; else return new String[]{getMainLongTitle()};
+ }
+
+ public String getAbstract() {
+ if (abstrct != null) return abstrct; else return getMainLongTitle();
+ }
+
+ public byte[] getText() {
+ // returns only the clear (visible) text (not the source data)
+ return text;
+ }
+
+ public Map getAnchors() {
+ // returns all links embedded as anchors (clickable entities)
+ return anchors;
+ }
+
+ public Map getImages() {
+ // returns all links embedded as pictures (visible in document)
+ return images;
+ }
+
+ // the next three methods provide a calculated view on the getAnchors/getImages:
+
+ public Map getHyperlinks() {
+ // this is a subset of the getAnchor-set: only links to other hyperrefs
+ if (hyperlinks == null) resortLinks();
+ return hyperlinks;
+ }
+
+ public Map getMedialinks() {
+ // this is partly subset of getAnchor and getImage: all non-hyperrefs
+ if (medialinks == null) resortLinks();
+ return medialinks;
+ }
+
+ public Map getEmaillinks() {
+ // this is part of the getAnchor-set: only links to email addresses
+ if (emaillinks == null) resortLinks();
+ return emaillinks;
+ }
+
+ private synchronized void resortLinks() {
+ Iterator i;
+ String url;
+ int extpos;
+ String ext;
+ i = anchors.entrySet().iterator();
+ hyperlinks = new HashMap();
+ medialinks = new HashMap();
+ emaillinks = new HashMap();
+ Map.Entry entry;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ url = (String) entry.getKey();
+ if ((url != null) && (url.startsWith("mailto:"))) {
+ emaillinks.put(url.substring(7), entry.getValue());
+ } else {
+ extpos = url.lastIndexOf(".");
+ String normal;
+ if (extpos > 0) {
+ ext = url.substring(extpos).toLowerCase();
+ normal = urlNormalform(url);
+ if (normal != null) {
+ if (mediaExt.indexOf(ext.substring(1)) >= 0) {
+ // this is not a normal anchor, it's a media link
+ medialinks.put(normal, entry.getValue());
+ } else {
+ hyperlinks.put(normal, entry.getValue());
+ }
+ }
+ }
+ }
+ }
+ // finally add the images to the medialinks
+ i = images.entrySet().iterator();
+ String normal;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ url = (String) entry.getKey();
+ normal = urlNormalform(url);
+ if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
+ }
+ expandHyperlinks();
+ }
+
+
+ public synchronized void expandHyperlinks() {
+ // we add artificial hyperlinks to the hyperlink set that can be calculated from
+ // given hyperlinks and imagelinks
+ hyperlinks.putAll(allReflinks(hyperlinks));
+ hyperlinks.putAll(allReflinks(medialinks));
+ hyperlinks.putAll(allSubpaths(hyperlinks));
+ hyperlinks.putAll(allSubpaths(medialinks));
+ }
+
+ }
+
+ private static Map allReflinks(Map links) {
+ // we find all links that are part of a reference inside a url
+ HashMap v = new HashMap();
+ Iterator i = links.keySet().iterator();
+ String s;
+ int pos;
+ loop: while (i.hasNext()) {
+ s = (String) i.next();
+ if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) {
+ i.remove();
+ s = s.substring(pos);
+ while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos);
+ if (!(v.containsKey(s))) v.put(s, "ref");
+ continue loop;
+ }
+ if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) {
+ i.remove();
+ s = "http:/" + s.substring(pos);
+ while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos);
+ if (!(v.containsKey(s))) v.put(s, "ref");
+ continue loop;
+ }
+ }
+ return v;
+ }
+
+ private static Map allSubpaths(Map links) {
+ HashMap v = new HashMap();
+ Iterator i = links.keySet().iterator();
+ String s;
+ int pos;
+ while (i.hasNext()) {
+ s = (String) i.next();
+ if (s.endsWith("/")) s = s.substring(0, s.length() - 1);
+ pos = s.lastIndexOf("/");
+ while (pos > 8) {
+ s = s.substring(0, pos + 1);
+ if (!(v.containsKey(s))) v.put(s, "sub");
+ s = s.substring(0, pos);
+ pos = s.lastIndexOf("/");
+ }
+ }
+ return v;
+ }
+
+}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index a05bca134..633f4610e 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -482,19 +482,21 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&
(entry.depth < entry.profile.generalDepth())) {
- Properties hl = entry.scraper.getHyperlinks();
- Enumeration e = hl.propertyNames();
+ Map hl = entry.scraper.getHyperlinks();
+ Iterator i = hl.entrySet().iterator();
String nexturlstring;
String rejectReason;
int c = 0;
- while (e.hasMoreElements()) {
- nexturlstring = (String) e.nextElement();
- rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, hl.getProperty(nexturlstring), entry.lastModified, entry.depth + 1, entry.profile);
+ Map.Entry e;
+ while (i.hasNext()) {
+ e = (Map.Entry) i.next();
+ nexturlstring = (String) e.getKey();
+ rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile);
if (rejectReason == null) {
c++;
} else {
errorURL.newEntry(new URL(nexturlstring), entry.urlString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
- hl.getProperty(nexturlstring), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
+ (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +